This report was generated on 2022-08-29 18:09:42. R version: 4.2.0 on x86_64-apple-darwin17.0. For this report, CRAN packages as of 2022-05-01 were used.
…
The preprocessing and analysis of the data was conducted in the R project for statistical
computing. The RMarkdown script used to generate this document and
all the resulting data can be downloaded under
this link. Through executing main.Rmd, the herein
described process can be reproduced and this document can be generated.
In the course of this, data from the folder input will be
processed and results will be written to output. The html
on-line version of the analysis can be accessed through this link.
The code for the herein described process can also be freely downloaded from https://github.com/fernandomillanvillalobos/r-data-manipulation.
…
abc.csv (Example)| Attribute | Type | Description |
|---|---|---|
| a | Numeric | … |
| b | Numeric | … |
| c | Numeric | … |
…
## [1] "package package:rmarkdown detached"
# from https://mran.revolutionanalytics.com/web/packages/\
# checkpoint/vignettes/using-checkpoint-with-knitr.html
# if you don't need a package, remove it from here (commenting not sufficient)
# tidyverse: see https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/
# Write the package manifest to disk: checkpoint() (called below with
# scanForPackages = T) scans project files for library() calls to decide
# which packages to install for the pinned CRAN snapshot date, so the
# list must exist as a real file ("manifest.R") on disk.
cat("
library(rstudioapi)
library(tidyverse)
library(data.table)
library(tidylog)
library(jsonlite)
library(lintr)
library(rmarkdown)
library(rio)
library(cowplot)
library(extrafont)
library(ggrepel)
library(scales)
library(pacman)
library(htmltab)
library(rmiscutils)
library(RSQLite)
library(fs)
library(openxlsx)
library(waldo)
library(vcdExtra)
library(psych)
library(Hmisc)
library(skimr)
library(gapminder)
library(lsr)
library(chron)
library(plm)
library(randomNames)
library(encryptr)
library(robotstxt)
library(janitor)",
file = "manifest.R")# if checkpoint is not yet installed, install it (for people using this
# system for the first time)
# Bootstrap the checkpoint package: if it is not installed yet, first make
# sure devtools is available, then install the pinned checkpoint release
# (v0.3.2) from GitHub so the reproducible-snapshot machinery is in place.
# Fix: use an https CRAN mirror -- CRAN no longer serves packages over
# plain http, so the original http:// URLs would fail on a fresh setup.
if (!require(checkpoint)) {
  if (!require(devtools)) {
    install.packages("devtools", repos = "https://cran.us.r-project.org")
    require(devtools)
  }
  devtools::install_github("RevolutionAnalytics/checkpoint",
                           ref = "v0.3.2", # pinned release so snapshot
                           # behaviour does not drift between runs
                           repos = "https://cran.us.r-project.org")
  require(checkpoint)
}
# nolint start
# checkpoint keeps its private package library under ~/.checkpoint;
# create the directory on first use so checkpoint() does not fail.
if (!dir.exists("~/.checkpoint")) {
  dir.create("~/.checkpoint")
}
# nolint end
# install packages for the specified CRAN snapshot date
# (package_date, path_to_wd and r_version are defined earlier in the Rmd)
# Fix: spell out TRUE/FALSE -- T and F are ordinary variables that can be
# reassigned, which would silently change these arguments.
checkpoint(snapshot_date = package_date,
           project = path_to_wd,
           verbose = TRUE,
           scanForPackages = TRUE,
           use.knitr = FALSE,
           R.version = r_version)
rm(package_date)
# Execute the manifest so all libraries are attached, then remove the
# temporary file.
source("manifest.R")
unlink("manifest.R")
sessionInfo()## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] C/UTF-8/C/C/C/C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] janitor_2.1.0 robotstxt_0.7.13 encryptr_0.1.3
## [4] randomNames_1.5-0.0 plm_2.6-1 chron_2.3-56
## [7] lsr_0.5.2 gapminder_0.3.0 skimr_2.1.4
## [10] Hmisc_4.7-0 Formula_1.2-4 survival_3.3-1
## [13] lattice_0.20-45 psych_2.2.3 vcdExtra_0.8-0
## [16] gnm_1.1-2 vcd_1.4-9 waldo_0.4.0
## [19] openxlsx_4.2.5 fs_1.5.2 RSQLite_2.2.13
## [22] rmiscutils_0.2 htmltab_0.8.2 pacman_0.5.1
## [25] scales_1.2.0 ggrepel_0.9.1 extrafont_0.18
## [28] cowplot_1.1.1 rio_0.5.29 rmarkdown_2.14
## [31] lintr_2.0.1 jsonlite_1.8.0 tidylog_1.0.2
## [34] data.table_1.14.2 forcats_0.5.2 stringr_1.4.1
## [37] dplyr_1.0.9 purrr_0.3.4 readr_2.1.2
## [40] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6
## [43] tidyverse_1.3.1 checkpoint_1.0.2 rstudioapi_0.14
## [46] knitr_1.40
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.0 backports_1.4.1 miscTools_0.6-26
## [4] repr_1.1.4 lazyeval_0.2.2 splines_4.2.0
## [7] digest_0.6.29 ca_0.71.1 htmltools_0.5.3
## [10] fansi_1.0.3 relimp_1.0-5 magrittr_2.0.3
## [13] checkmate_2.1.0 memoise_2.0.1 cluster_2.1.3
## [16] tzdb_0.3.0 remotes_2.4.2 toOrdinal_1.3-0.0
## [19] modelr_0.1.8 sandwich_3.0-1 extrafontdb_1.0
## [22] bdsmatrix_1.3-4 jpeg_0.1-9 colorspace_2.0-3
## [25] blob_1.2.3 rvest_1.0.2 haven_2.5.0
## [28] rbibutils_2.2.8 xfun_0.32 callr_3.7.2
## [31] crayon_1.5.1 zoo_1.8-10 glue_1.6.2
## [34] gtable_0.3.0 Rttf2pt1_1.3.10 maxLik_1.5-2
## [37] DBI_1.1.3 Rcpp_1.0.9 htmlTable_2.4.1
## [40] foreign_0.8-82 bit_4.0.4 clisymbols_1.2.0
## [43] collapse_1.7.6 htmlwidgets_1.5.4 rex_1.2.1
## [46] httr_1.4.4 qvcalc_1.0.2 RColorBrewer_1.1-3
## [49] ellipsis_0.3.2 pkgconfig_2.0.3 nnet_7.3-17
## [52] sass_0.4.2 dbplyr_2.1.1 deldir_1.0-6
## [55] utf8_1.2.2 tidyselect_1.1.2 rlang_1.0.4
## [58] munsell_0.5.0 cellranger_1.1.0 tools_4.2.0
## [61] cachem_1.0.6 cli_3.3.0 generics_0.1.3
## [64] broom_0.8.0 evaluate_0.16 fastmap_1.1.0
## [67] yaml_2.3.5 processx_3.7.0 bit64_4.0.5
## [70] zip_2.2.0 nlme_3.1-157 xml2_1.3.3
## [73] brio_1.1.3 compiler_4.2.0 curl_4.3.2
## [76] png_0.1-7 testthat_3.1.4 reprex_2.0.1
## [79] bslib_0.4.0 stringi_1.7.8 cyclocomp_1.1.0
## [82] ps_1.7.1 desc_1.4.1 Matrix_1.4-1
## [85] vctrs_0.4.1 pillar_1.8.1 lifecycle_1.0.1
## [88] Rdpack_2.3 lmtest_0.9-40 jquerylib_0.1.4
## [91] R6_2.5.1 latticeExtra_0.6-30 gridExtra_2.3
## [94] MASS_7.3-57 assertthat_0.2.1 rprojroot_2.0.3
## [97] withr_2.5.0 mnormt_2.1.0 parallel_4.2.0
## [100] hms_1.1.2 rpart_4.1.16 snakecase_0.11.0
## [103] lubridate_1.8.0 base64enc_0.1-3 interp_1.1-3
# if you want to outsource logic to other script files, see README for
# further information
# Load all visualizations functions as separate scripts
# Register each visualization helper with knitr (read_chunk exposes its
# chunks to the document) and load its functions into the session
# (source), in the same order as before.
helper_scripts <- c(
  "scripts/dviz.supp.R",
  "scripts/themes.R",
  "scripts/plot_grid.R",
  "scripts/align_legend.R",
  "scripts/label_log10.R"
)
for (helper in helper_scripts) {
  knitr::read_chunk(helper)
  source(helper)
}
knitr::read_chunk("scripts/outliers.R")
source("scripts/outliers.R")There are some different ways to do this, depending on how your data is formatted and where it’s located.
R provides a nice GUI for editing tabular data: the data editor.
The c function has already been introduced as a way to
input small amounts of data into R. When the amount of data is large,
and especially when typing the data into the console is inappropriate,
the scan function can be used. scan is most
appropriate when all the data to be read is of the same mode, so that it
can be accommodated by a vector or matrix. The first argument to
scan can be a quoted string or character variable
containing the name of a file or a URL, or it can be any of a number of
connections to allow other input sources. If no first argument is given,
scan will read from the console, stopping when a completely blank line
is entered. By default, scan expects all of its input to be
numeric data; this can be overridden with the what=
argument, which specifies the type of data that scan will see.
When reading from the console, R will prompt you with the index of the next item to be entered, and report on the number of elements read when it’s done. If the what= argument to scan is a list containing examples of the expected data types, scan will output a list with as many elements as there are data types provided. To specify numeric values, you can pass a value of 0.
Note that, by naming the elements in the list passed through the
what= argument, the output list elements are appropriately
named. When the argument to what= is a list, the
multi.line= option can be set to FALSE to prevent scan from
trying to use multiple lines to read the records for an observation. One
of the most common uses for scan is to read in data matrices. Since scan
returns a vector, a call to scan can be embedded inside a call to the
matrix function.
# Example data: the five highest-paid NFL players. The columns are built
# as a named list and exposed as standalone vectors (as in the original
# example) before being assembled into a data frame.
players <- list(
  name.last = c("Manning", "Brady", "Pepper", "Palmer", "Manning"),
  name.first = c("Peyton", "Tom", "Julius", "Carson", "Eli"),
  team = c("Colts", "Patriots", "Panthers", "Bengals", "Giants"),
  position = c("QB", "QB", "DE", "QB", "QB"),
  salary = c(18700000, 14626720, 14137500, 13980000, 12916666)
)
# keep the individual vectors around, exactly as the original code did
list2env(players, envir = environment())
top.5.salaries <- data.frame(name.last, name.first, team, position, salary)
top.5.salaries## name.last name.first team position salary
## 1 Manning Peyton Colts QB 18700000
## 2 Brady Tom Patriots QB 14626720
## 3 Pepper Julius Panthers DE 14137500
## 4 Palmer Carson Bengals QB 13980000
## 5 Manning Eli Giants QB 12916666
# calling the built-in data editor
# top.5.salaries <- edit(top.5.salaries)
# fix(top.5.salaries)
# using scan
# names <- scan(what = "")
# names
# names2 = scan(what=list(a=0,b="",c=0))
# names2
# creating a matrix
# mymat <- matrix(scan(), ncol = 3, byrow = TRUE)
# mymatR includes a family of functions for importing delimited text files
into R, based on the read.table function. The
read.table function reads a text file into R and returns a
data.frame object. Each row in the input file is interpreted as an
observation. Each column in the input file represents a variable. The
read.table function expects each field to be separated by a delimiter.
The most important options are sep and header.
R includes a set of convenience functions that call
read.table with different default options for these values.
Besides that, you can fetch a CSV file from a single URL.
read.table options
Although not as common as white-space-, tab-, or comma-separated
data, sometimes input data is stored with no delimiters between the
values, but with each variable occupying the same columns on each line
of input. In cases like this, the read.fwf function can be
used. The widths= argument can be a vector containing the
widths of the fields to be read, using negative numbers to indicate
columns to be skipped. If the data for each observation occupies more
than one line, widths= can be a list of as many vectors as
there are lines per observation. The header=,
row.names=, and col.names= arguments behave
similarly to those in read.table.
# Read a local comma-separated file; the first row supplies the column
# names and quote = "\"" lets fields contain embedded commas.
snowdata <- read.table("input/BostonWinterSnowfalls.csv", header = TRUE, sep = ",", quote = "\"")
# getting data online
# sep = "" makes read.csv split on any whitespace instead of commas.
# NOTE(review): the object is named sp500 but the URL points at Boston
# snowfall data -- presumably a leftover name; confirm before reusing it.
sp500 <- read.csv("http://bit.ly/BostonSnowfallCSV", sep="")
# getting data with no delimiters: write a fixed-width sample to a
# temporary file, then read it back with read.fwf.
ff <- tempfile()
cat(file = ff, "New York, NY 66,834.6
Kings, NY 34,722.9
Bronx, NY 31,729.8
Queens, NY 20,453.0
San Francisco, CA 16,526.2
Hudson, NJ 12,956.9
Suffolk, MA 11,691.6
Philadelphia, PA 11,241.1
Washington, DC 9,378.0
Alexandria IC, VA 8,552.2")
# widths = c(18, -19, 8): read columns 1-18, skip the next 19, read 8
# more; as.is = TRUE keeps character columns as strings (no factors).
# NOTE(review): the rendered output below shows a truncated V1 and an
# all-NA V2, so these widths do not match the data as written above --
# the fixed-width padding appears to have been collapsed at some point;
# verify the widths against the originally intended column layout.
city <- read.fwf(ff, widths = c(18, -19, 8), as.is = TRUE)
city## V1 V2
## 1 New York, NY 66,83 NA
## 2 Kings, NY 34,7 NA
## 3 Bronx, NY 31,7 NA
## 4 Queens, NY 20, NA
## 5 San Francisco, NA
## 6 Hudson, NJ 12, NA
## 7 Suffolk, MA 11 NA
## 8 Philadelphia, NA
## 9 Washington, DC NA
## 10 Alexandria IC, NA
R can also export R data objects (usually data frames and matrices)
as text files. To export data to a text file, use the
write.table function. There are wrapper functions for
write.table that call write.table with
different defaults. These are useful if you want to create a file of
comma-separated values.
# write.table(snowdata, file = "output/snowdata.txt", quote = FALSE, sep = ",", row.names = FALSE)
# write.csv(snowdata, file = "output/snowdata.csv", row.names = FALSE)In order to connect directly to a database from R, you will need to install some optional packages. The packages you need depend on the database(s) to which you want to connect and the connection method you want to use.
There are two sets of database interfaces available in R:
RODBC. The RODBC package allows R to fetch data from ODBC (Open DataBase Connectivity) connections. ODBC provides a standard interface for different programs to connect to databases.
DBI. The DBI package allows R to connect to databases using native database drivers or JDBC drivers. This package provides a common database abstraction for R software.
DBI is not a single package, but instead is a framework and set of packages for accessing databases.One important difference between the DBI packages and the RODBC package is in the objects they use: DBI uses S4 objects to represent drivers, connections, and other objects. To open a connection with DBI, use the dbConnect function. The argument drv can be a DBIDriver object or a character value describing the driver to use. You can generate a DBIDriver object with a call to the DBI driver. The dbConnect function can take additional options, depending on the type of database you are using. For SQLite databases, the most important argument is dbname (which specifies the database file). Check the help files for the database you are using for more options. Even arguments for parameters like usernames are not the same between databases.
As example we can easily copy an R data frame into a SQLite database with dbWriteTable().
# to connect with an external database
# drv <- dbDriver("SQLite")
# con <- dbConnect(drv, dbname = system.file("extdata", "bb.db", package = "nutshell"))
# creating our database
# An empty dbname ("") asks RSQLite for a temporary on-disk database that
# is deleted when the connection closes (":memory:" would be in-RAM).
mydb <- dbConnect(RSQLite::SQLite(), "")
# Copy two built-in data frames into the database as tables.
dbWriteTable(mydb, "mtcars", mtcars)
dbWriteTable(mydb, "iris", iris)
# List the tables now present in the connection.
dbListTables(mydb)## [1] "iris" "mtcars"
# Issue a query with dbGetQuery()
dbGetQuery(mydb, 'SELECT * FROM mtcars LIMIT 5')## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# disconnecting from database
dbDisconnect(mydb)El paquete readr, incluido en la familia tidyverse, admite la lectura
de múltiples formatos de archivo usando funciones que comienzan por
read_* o write_*. Por defecto, la función
read_excel() importa la primera hoja. Para importar una
hoja diferente es necesario indicarlo con el argumento sheet o bien el
número o el nombre (segundo argumento). La función más importante para
leer múltiples hojas es map() del paquete purrr
que forma parte de la colección de paquetes tidyverse. map()
permite aplicar una función a cada elemento de un vector o lista. Existe
una variante de map() que directamente nos une todas las
tablas por fila: map_df(). Si fuese necesario unir por
columna, se debería usar map_dfc().
La función dir_ls() del paquete fs (https://github.com/r-lib/fs), a pesar de ser similar a
dir() de R Base, tiene algunas ventajas, como su total
compatibilidad con la colección de funciones de tidyverse. Por
ejemplo, el argumento regexp permite buscar con una
expresión regular un patrón en las rutas y ficheros.
Funciones en readr
Most of us would probably read the CSV file first and then do the
data cleaning. For example, using the clean_names function
from the janitor package. The same can be achieved inside
read_csv with the function make_clean_names
for the name_repair argument. The function uses the snake
naming convention by default. Snake converts all names to lowercase and
separates words with an underscore. Besides that,
clean_names does not work with vectors, but
make_clean_names does.
List of all naming conventions
With make_clean_names you can also replace certain characters from
the column names. If you are familiar with regular expressions, you can
make more complex replacements. Apart from cleaning your column names,
you can also select columns directly from read_csv using
the col_select argument.
# janitor approach
mpg_new <- read_csv("input/mpg_uppercase.csv", show_col_types = FALSE) %>%
janitor::clean_names() %>%
select(c(manufacturer, model)) %>%
glimpse()## Rows: 6
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4"
# tidyverse approach
read_csv("input/mpg_uppercase.csv", name_repair = make_clean_names, show_col_types = FALSE) %>%
glimpse()## Rows: 6
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4"
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8
## $ year <dbl> 1999, 1999, 2008, 2008, 1999, 1999
## $ cyl <dbl> 4, 4, 4, 4, 6, 6
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
## $ cty <dbl> 18, 21, 20, 21, 16, 18
## $ hwy <dbl> 29, 29, 31, 30, 26, 26
## $ fl <chr> "p", "p", "p", "p", "p", "p"
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "c…
# replacing and removing character strings with make_clean_names
make_clean_names(c("A", "B%", "C"), replace = c("%" = "_percent"))## [1] "a" "b_percent" "c"
# with reg expressions
make_clean_names(c("A_1", "B_1", "C_1"), replace = c("^A_" = "a"))## [1] "a1" "b_1" "c_1"
# snake naming convention per default
make_clean_names(c("myHouse", "MyGarden"), case = "snake")## [1] "my_house" "my_garden"
make_clean_names(c("myHouse", "MyGarden"), case = "none")## [1] "myHouse" "MyGarden"
read_csv("input/mpg_uppercase.csv", show_col_types = FALSE, name_repair = ~ make_clean_names(., case = "upper_camel")) %>% # The dot . in make_clean_names denotes the vector of column names.
glimpse()## Rows: 6
## Columns: 11
## $ Manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ Model <chr> "a4", "a4", "a4", "a4", "a4", "a4"
## $ Displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8
## $ Year <dbl> 1999, 1999, 2008, 2008, 1999, 1999
## $ Cyl <dbl> 4, 4, 4, 4, 6, 6
## $ Trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ Drv <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
## $ Cty <dbl> 18, 21, 20, 21, 16, 18
## $ Hwy <dbl> 29, 29, 31, 30, 26, 26
## $ Fl <chr> "p", "p", "p", "p", "p", "p"
## $ Class <chr> "compact", "compact", "compact", "compact", "compact", "c…
# selecting specific columns
read_csv("input/mpg_uppercase.csv", show_col_types = FALSE, name_repair = make_clean_names, col_select = c(manufacturer, model)) %>%
glimpse()## Rows: 6
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4"
Column names often contain spaces, special characters, or are written
in a mixture of lower and upper case characters. Such poorly formatted
column names can lead to numerous problems. We could easily solve these
issues with the rename function but this approach does not
scale. The main difference between rename and rename_with
is that rename_with changes the column names using a function. The three
main arguments of the function are .data, .fn
and .cols. .data stands for the data frame,
.fn for the function to apply to the column names, and
.cols for the columns to apply the function to.
Use a specific naming convention for column names using the
make_clean_names function from the janitor
package. It is used the tilde operator to indicate an anonymous
function. This shortcut is needed whenever you need to call certain
arguments of a function.
Another use case of rename_with is the replacement of
characters. We use the gsub function to replace a specific
character. Alternatively, we could have used the
str_replace function. With pattern we said
that we are looking for a group of characters containing one or more
digits (\d+). \d+ is a regular expression. A group in the
argument pattern is everything between two brackets. With
replacement we said that we want to put an underscore in
front of this group. The group itself is specified by \1. If we had two
groups, the second group would be specified by \2.
You can use .cols to specify which column names to apply
the function to. And you can even use our tidyselect functions
for that. Another useful function is matches. With
matches, you can search for specific patterns in your
column names and apply a function to the column names that match the
pattern.
mpg %>%
rename_with(
.fn = toupper,
.cols = everything()
)## # A tibble: 234 × 11
## MANUFACTURER MODEL DISPL YEAR CYL TRANS DRV CTY HWY FL CLASS
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
## # … with 224 more rows
# the same as
mpg %>%
rename_with(
.fn = toupper,
.cols = everything()
) %>%
colnames()## [1] "MANUFACTURER" "MODEL" "DISPL" "YEAR" "CYL"
## [6] "TRANS" "DRV" "CTY" "HWY" "FL"
## [11] "CLASS"
# using janitor to build a specific name convention
iris %>%
rename_with(~ janitor::make_clean_names(., case = "big_camel")) %>%
colnames()## [1] "SepalLength" "SepalWidth" "PetalLength" "PetalWidth" "Species"
# replacing characters
mpg %>%
rename_with(~ gsub("e", "_", .)) %>%
colnames()## [1] "manufactur_r" "mod_l" "displ" "y_ar" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
# or
mpg %>%
rename_with(~ str_replace(., "e", "_")) %>%
colnames()## [1] "manufactur_r" "mod_l" "displ" "y_ar" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
# replacing characters using grouping function:
# "(\\d+)" captures a run of digits as group 1; the replacement "_\\1"
# re-inserts that captured group after an underscore, so "x1" -> "x_1".
anscombe %>%
  rename_with(~ str_replace(.,
    pattern = "(\\d+)",
    replacement = "_\\1"
  )) %>%
  colnames()## [1] "x_1" "x_2" "x_3" "x_4" "y_1" "y_2" "y_3" "y_4"
# renaming variables for specific variables
anscombe %>%
rename_with(~ str_replace(
., "([:alpha:])([1-2])",
"\\1psilon\\2_"
), c(y1, y2)) %>%
colnames()## [1] "x1" "x2" "x3" "x4" "ypsilon1_" "ypsilon2_"
## [7] "y3" "y4"
# using tidyverse functions
mpg %>%
rename_with(~ toupper(.), where(is.numeric)) %>%
colnames()## [1] "manufacturer" "model" "DISPL" "YEAR" "CYL"
## [6] "trans" "drv" "CTY" "HWY" "fl"
## [11] "class"
iris %>%
rename_with(
~ str_replace(., "\\.", "_"),
starts_with("Sepal")
) %>%
colnames()## [1] "Sepal_Length" "Sepal_Width" "Petal.Length" "Petal.Width" "Species"
# using matches
iris %>%
rename_with(
~ str_replace(., "\\.", "_"),
matches("[Ww]idth$")
) %>%
colnames()## [1] "Sepal.Length" "Sepal_Width" "Petal.Length" "Petal_Width" "Species"
You don't always read just one file into R. It is not uncommon for
your data to be scattered in hundreds or thousands of files. Of course,
you don't want to read these files into R manually. So you need an
automatic method for reading in files. Before we can read the files into
R, we need to create a character vector of the file paths. You have
several options to create such a vector. You can use the R base function
list.files, which returns character vectors of the names of
files in a directory or you use the function dir_ls from
the fs package. The other option is to use the
dir_ls function from the fs package. fs
provides a cross-platform interface for accessing files on your hard
disk. It supports all file operations (deleting, creating files, moving
files, etc.).
Now that we know the file paths, we can load the files into R. The
tidyverse way to do this is to use the map_dfr function
from the purrr package. map_dfr loops through all
the file paths and binds the data frames into a single data frame. The
.x in the following code stands for the file name. To
output the actual csv files and not the filenames, we need to put
.x (the path) in a read_* function. In this
example we are working with CSV files. The trick works the same for all
rectangular file formats. Another approach is to use the
read_csv function directly by putting the character vector
of the file names directly into read_csv.
Sometimes your files are deeply nested. In that case, we need to
search through each folder recursively. If you try to load all csv files
from the nested_folders folder, you would get an empty vector. This is
because dir_ls does not look in the nested folders, but
only in the parent folder. To make dir_ls search through
the folders recursively, you need to set the recurse
argument to TRUE.
You don't always need all the files in your directory and may need to
remove some files from the list of file paths. A good way to do this is
to use the str_detect function from the stringr
package. The function returns logical values. To change the actual
character vector, we need to add these logical values to the character
vector itself. With the negate argument you can find only
the files that do not match the pattern.
# Read the sunshine-hours file; its metadata header occupies the first 19
# lines, hence skip = 19, and clean_names() normalizes the column names.
horas_sol <- read_csv("input/SS_STAID001395.txt", skip = 19) |> # data begins at line 20
  janitor::clean_names()
head(horas_sol)## # A tibble: 6 × 4
## souid date ss q_ss
## <dbl> <dbl> <dbl> <dbl>
## 1 120414 19560501 -9999 9
## 2 120414 19560502 -9999 9
## 3 120414 19560503 -9999 9
## 4 120414 19560504 -9999 9
## 5 120414 19560505 -9999 9
## 6 120414 19560506 -9999 9
# .xlsx files
# importing .xls file
emisiones <- readxl::read_xls("input/env_air_gge.xls", sheet = 1, skip = 362, n_max = 36)
head(emisiones)## # A tibble: 6 × 11
## GEO/TI…¹ `2007` `2008` `2009` `2010` `2011` `2012` `2013` `2014` `2015` `2016`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Europea… 4.33e5 4.30e5 4.25e5 4.20e5 4.20e5 4.18e5 4.21e5 4.28e5 4.30e5 4.30e5
## 2 Europea… 4.34e5 4.31e5 4.26e5 4.21e5 4.21e5 4.19e5 4.22e5 4.29e5 4.30e5 4.31e5
## 3 Belgium 1.03e4 1.02e4 1.03e4 1.02e4 1.01e4 9.92e3 9.96e3 1.02e4 1.01e4 9.90e3
## 4 Bulgaria 4.90e3 5.15e3 4.98e3 5.45e3 5.11e3 5.24e3 5.72e3 6.19e3 6.24e3 6.53e3
## 5 Czech R… 7.84e3 7.99e3 7.58e3 7.41e3 7.59e3 7.58e3 7.76e3 7.96e3 8.16e3 8.52e3
## 6 Denmark 1.08e4 1.07e4 1.05e4 1.04e4 1.04e4 1.04e4 1.03e4 1.05e4 1.04e4 1.05e4
## # … with abbreviated variable name ¹`GEO/TIME`
# iterate over multiple worksheets in a workbook:
# excel_sheets() returns the sheet names; set_names() names the vector
# with its own values so that map_df()'s .id = "yr" column carries the
# sheet name; each sheet is read with read_excel and row-bound.
path <- "input/madrid_temp.xlsx"
mad <- path %>%
  readxl::excel_sheets() %>%
  set_names() %>%
  map_df(readxl::read_excel,
    path = path, .id = "yr"
  )
head(mad)## # A tibble: 6 × 3
## yr date ta
## <chr> <dttm> <dbl>
## 1 2000 2000-01-01 00:00:00 5.4
## 2 2000 2000-01-02 00:00:00 5
## 3 2000 2000-01-03 00:00:00 3.5
## 4 2000 2000-01-04 00:00:00 4.3
## 5 2000 2000-01-05 00:00:00 0.6
## 6 2000 2000-01-06 00:00:00 3.8
# importing and reading several .xlsx files at once without merging
dir_ls("input", regexp = "xlsx") %>%
map(readxl::read_excel)## $`input/berlin_temp.xlsx`
## # A tibble: 366 × 2
## date ta
## <dttm> <dbl>
## 1 2000-01-01 00:00:00 1.2
## 2 2000-01-02 00:00:00 3.6
## 3 2000-01-03 00:00:00 5.7
## 4 2000-01-04 00:00:00 5.1
## 5 2000-01-05 00:00:00 2.2
## 6 2000-01-06 00:00:00 1.8
## 7 2000-01-07 00:00:00 4.2
## 8 2000-01-08 00:00:00 4.2
## 9 2000-01-09 00:00:00 4.2
## 10 2000-01-10 00:00:00 1.7
## # … with 356 more rows
##
## $`input/madrid_temp.xlsx`
## # A tibble: 366 × 2
## date ta
## <dttm> <dbl>
## 1 2000-01-01 00:00:00 5.4
## 2 2000-01-02 00:00:00 5
## 3 2000-01-03 00:00:00 3.5
## 4 2000-01-04 00:00:00 4.3
## 5 2000-01-05 00:00:00 0.6
## 6 2000-01-06 00:00:00 3.8
## 7 2000-01-07 00:00:00 6.2
## 8 2000-01-08 00:00:00 5.4
## 9 2000-01-09 00:00:00 5.5
## 10 2000-01-10 00:00:00 4.8
## # … with 356 more rows
# merging into a new column: .id = "city" records which file each row
# came from (the full file path at this point)
data_df <- dir_ls("input", regexp = "xlsx") %>%
  map_df(readxl::read_excel, .id = "city")
# cleaning city column: drop the directory (path_file), drop the file
# extension (path_ext_remove) and strip the "_temp" suffix, leaving just
# the city name (e.g. "input/berlin_temp.xlsx" -> "berlin")
data_df <- mutate(data_df, city = path_file(city) %>%
  path_ext_remove() %>%
  str_replace("_temp", ""))
head(data_df)## # A tibble: 6 × 3
## city date ta
## <chr> <dttm> <dbl>
## 1 berlin 2000-01-01 00:00:00 1.2
## 2 berlin 2000-01-02 00:00:00 3.6
## 3 berlin 2000-01-03 00:00:00 5.7
## 4 berlin 2000-01-04 00:00:00 5.1
## 5 berlin 2000-01-05 00:00:00 2.2
## 6 berlin 2000-01-06 00:00:00 1.8
# .csv files
# adding new directory
# dir_create("input", c("many_files"))
# creating random samples from mpg data set
# mpg_samples <- map(1:25, ~ slice_sample(mpg, n = 20))
# adding .csv files from samples to the new directory
# iwalk(mpg_samples, ~ write_csv(., paste0("input/many_files/", .y, ".csv")))
# creating a character vector of file paths
# with list.files from Base-R
(csv_files_list_files <- list.files(path = "input/many_files", pattern = "csv", full.names = TRUE))## [1] "input/many_files/1.csv" "input/many_files/10.csv"
## [3] "input/many_files/11.csv" "input/many_files/12.csv"
## [5] "input/many_files/13.csv" "input/many_files/14.csv"
## [7] "input/many_files/15.csv" "input/many_files/16.csv"
## [9] "input/many_files/17.csv" "input/many_files/18.csv"
## [11] "input/many_files/19.csv" "input/many_files/2.csv"
## [13] "input/many_files/20.csv" "input/many_files/21.csv"
## [15] "input/many_files/22.csv" "input/many_files/23.csv"
## [17] "input/many_files/24.csv" "input/many_files/25.csv"
## [19] "input/many_files/3.csv" "input/many_files/4.csv"
## [21] "input/many_files/5.csv" "input/many_files/6.csv"
## [23] "input/many_files/7.csv" "input/many_files/8.csv"
## [25] "input/many_files/9.csv"
# with dir_ls from fs package
(csv_files_dir_ls <- dir_ls(path = "input/many_files/", glob = "*.csv", type = "file"))## input/many_files/1.csv input/many_files/10.csv input/many_files/11.csv
## input/many_files/12.csv input/many_files/13.csv input/many_files/14.csv
## input/many_files/15.csv input/many_files/16.csv input/many_files/17.csv
## input/many_files/18.csv input/many_files/19.csv input/many_files/2.csv
## input/many_files/20.csv input/many_files/21.csv input/many_files/22.csv
## input/many_files/23.csv input/many_files/24.csv input/many_files/25.csv
## input/many_files/3.csv input/many_files/4.csv input/many_files/5.csv
## input/many_files/6.csv input/many_files/7.csv input/many_files/8.csv
## input/many_files/9.csv
# another example using map_df(), list.files() and rio::import():
# rio::import() picks a reader based on each file's extension, and
# map_df() row-binds every report into a single data frame
contributions <- map_df(list.files("input/mayor_finance_reports", full.names = TRUE), rio::import)
head(contributions)## Date Contributor Address City State Zip
## 1 7/30/2017 Curt DeChicco <NA> <NA> MA <NA>
## 2 7/30/2017 Nicolle Eduardo <NA> <NA> MA <NA>
## 3 7/30/2017 Heidi Vasconcelos <NA> <NA> MA <NA>
## 4 7/30/2017 Robert Fair 656 Grove St Framingham MA 01701
## 5 7/30/2017 Jonates Azevedo 2 Dell Ann Circle Milford MA 01757
## 6 7/30/2017 Horrigan Jennifer <NA> <NA> MA <NA>
## Occupation Employer Amount Recipient
## 1 <NA> <NA> 40 Horrigan, Joshua Paul
## 2 <NA> <NA> 20 Horrigan, Joshua Paul
## 3 <NA> <NA> 20 Horrigan, Joshua Paul
## 4 Jewell Insurance Self 300 Horrigan, Joshua Paul
## 5 <NA> <NA> 100 Horrigan, Joshua Paul
## 6 <NA> <NA> 20 Horrigan, Joshua Paul
# using janitor::tabyl() function to count number of rows within a group
contributions <- map_df(list.files("input/mayor_finance_reports", full.names = TRUE), rio::import) %>%
  # keep local contributors only, excluding P.O.-box style addresses
  filter(City == "Framingham", !str_detect(tolower(Address), "box")) %>%
  # one row per unique contributor/address pair (drop repeat donations)
  distinct(Contributor, Address, .keep_all = TRUE) %>%
  # tabyl() tabulates counts and shares per Recipient
  tabyl(Recipient, sort = TRUE) %>%
  # mutate(percent = round(percent * 100, 1)) %>%
  select(Candidate = Recipient, Pct_Local_Contributors = percent)
contributions## Candidate Pct_Local_Contributors
## Horrigan, Joshua Paul 0.035820896
## Neves-Grigg, Sr., Benjaman 0.011940299
## Sen, Dhruba 0.008955224
## Sousa, Priscila 0.029850746
## Spicer, Dr. Yvonne M. 0.516417910
## Stefanini, John A. 0.337313433
## Tilden, Mark S. 0.059701493
# using adorn_percentages()
# read the election results and keep only candidate names and vote totals
results <- readr::read_csv("input/election_framingham_mayor_2017_09.csv", col_names = TRUE) %>%
  dplyr::select(Candidate, Totals)
results## # A tibble: 9 × 2
## Candidate Totals
## <chr> <dbl>
## 1 Blanks 56
## 2 Joshua Paul Horrigan 545
## 3 John A. Stefanini 3184
## 4 Dhruba P. Sen 101
## 5 Mark S. Tilden 439
## 6 Yvonne M. Spicer 5967
## 7 Benjaman A. Neves-Grigg, 134
## 8 Priscila Sousa 538
## 9 Write-Ins 42
results <- results %>%
  # drop the non-candidate tally rows before computing vote shares
  filter(!(Candidate %in% c("Blanks", "Write-Ins"))) %>%
  # adorn_percentages() converts Totals to each candidate's column-wise share
  adorn_percentages(denominator = "col") %>%
  rename(Pct_Vote = Totals)
results## Candidate Pct_Vote
## Joshua Paul Horrigan 0.049963330
## John A. Stefanini 0.291895856
## Dhruba P. Sen 0.009259259
## Mark S. Tilden 0.040245691
## Yvonne M. Spicer 0.547029703
## Benjaman A. Neves-Grigg, 0.012284562
## Priscila Sousa 0.049321599
# splitting "Last, First" candidate names; only the last name is needed for
# matching against the election results
# (fixed: the original passed a stray positional `2`, which bound to
# separate()'s remove= argument and was only coerced to TRUE by accident;
# into= and sep= are now named and the stray argument dropped — remove
# defaults to TRUE, so behavior is unchanged)
contributions_split <- tidyr::separate(
contributions, Candidate,
into = c("LastName", "FirstName"), sep = ", "
) %>%
select(-FirstName)
head(contributions_split)## LastName Pct_Local_Contributors
## Horrigan 0.035820896
## Neves-Grigg 0.011940299
## Sen 0.008955224
## Sousa 0.029850746
## Spicer 0.516417910
## Stefanini 0.337313433
# one-pass split of "First Middle Last" on spaces; separate() fills pieces
# left to right, so two-word names end up with LastName = NA (visible below)
results_split <- tidyr::separate(results, Candidate, c("FirstName", "MiddleName", "LastName"), " ")
tail(results_split)## # A tibble: 6 × 4
## FirstName MiddleName LastName Pct_Vote
## <chr> <chr> <chr> <dbl>
## 1 John A. Stefanini 0.292
## 2 Dhruba P. Sen 0.00926
## 3 Mark S. Tilden 0.0402
## 4 Yvonne M. Spicer 0.547
## 5 Benjaman A. Neves-Grigg, 0.0123
## 6 Priscila Sousa <NA> 0.0493
# %<>% (magrittr) pipes results_split through and assigns the result back
results_split %<>%
mutate(
# two-word names parsed the surname into MiddleName; shift it back
LastName = ifelse(is.na(LastName), MiddleName, LastName),
# strip the trailing comma left over from "Neves-Grigg,"
LastName = str_replace(LastName, ",", "")
) %>%
select(-FirstName, -MiddleName)
tail(results_split)## # A tibble: 6 × 2
## LastName Pct_Vote
## <chr> <dbl>
## 1 Stefanini 0.292
## 2 Sen 0.00926
## 3 Tilden 0.0402
## 4 Spicer 0.547
## 5 Neves-Grigg 0.0123
## 6 Sousa 0.0493
# reading the files from a character vector of paths
# map_dfr() reads each CSV and row-binds the results into one data frame
data_frames <- map_dfr(csv_files_dir_ls, ~ read_csv(.x, show_col_types = FALSE))
glimpse(data_frames)## Rows: 500
## Columns: 11
## $ manufacturer <chr> "volkswagen", "chevrolet", "volkswagen", "audi", "ford", …
## $ model <chr> "jetta", "malibu", "jetta", "a4", "explorer 4wd", "camry …
## $ displ <dbl> 2.0, 2.4, 2.0, 2.0, 4.0, 2.4, 5.7, 2.2, 2.8, 2.8, 1.8, 4.…
## $ year <dbl> 1999, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 199…
## $ cyl <dbl> 4, 4, 4, 4, 6, 4, 8, 4, 6, 6, 4, 8, 6, 8, 6, 6, 8, 6, 8, …
## $ trans <chr> "auto(l4)", "auto(l4)", "auto(s6)", "auto(av)", "manual(m…
## $ drv <chr> "f", "f", "f", "f", "4", "f", "4", "4", "4", "f", "f", "4…
## $ cty <dbl> 19, 22, 22, 21, 15, 22, 13, 21, 17, 18, 26, 13, 16, 13, 1…
## $ hwy <dbl> 26, 30, 29, 30, 19, 31, 18, 26, 25, 26, 35, 19, 24, 17, 2…
## $ fl <chr> "r", "r", "p", "p", "r", "r", "r", "r", "p", "p", "r", "r…
## $ class <chr> "compact", "midsize", "compact", "compact", "suv", "compa…
# and with a new column representing the file name
# (fixed: the original call read_csv(.x, , show_col_types = ...) contained a
# stray empty positional argument between the two commas; it is removed)
map_dfr(csv_files_dir_ls, ~ read_csv(.x, show_col_types = FALSE) %>%
mutate(filename = .x)) %>%
glimpse()## Rows: 500
## Columns: 12
## $ manufacturer <chr> "volkswagen", "chevrolet", "volkswagen", "audi", "ford", …
## $ model <chr> "jetta", "malibu", "jetta", "a4", "explorer 4wd", "camry …
## $ displ <dbl> 2.0, 2.4, 2.0, 2.0, 4.0, 2.4, 5.7, 2.2, 2.8, 2.8, 1.8, 4.…
## $ year <dbl> 1999, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 199…
## $ cyl <dbl> 4, 4, 4, 4, 6, 4, 8, 4, 6, 6, 4, 8, 6, 8, 6, 6, 8, 6, 8, …
## $ trans <chr> "auto(l4)", "auto(l4)", "auto(s6)", "auto(av)", "manual(m…
## $ drv <chr> "f", "f", "f", "f", "4", "f", "4", "4", "4", "f", "f", "4…
## $ cty <dbl> 19, 22, 22, 21, 15, 22, 13, 21, 17, 18, 26, 13, 16, 13, 1…
## $ hwy <dbl> 26, 30, 29, 30, 19, 31, 18, 26, 25, 26, 35, 19, 24, 17, 2…
## $ fl <chr> "r", "r", "p", "p", "r", "r", "r", "r", "p", "p", "r", "r…
## $ class <chr> "compact", "midsize", "compact", "compact", "suv", "compa…
## $ filename <fs::path> "input/many_files/1.csv", "input/many_files/1.csv", …
# using directly read_csv
# read_csv() accepts a vector of paths; id = "filename" records each row's source file
read_csv(csv_files_dir_ls, id = "filename", show_col_types = FALSE) %>%
glimpse## Rows: 500
## Columns: 12
## $ filename <chr> "input/many_files/1.csv", "input/many_files/1.csv", "inpu…
## $ manufacturer <chr> "volkswagen", "chevrolet", "volkswagen", "audi", "ford", …
## $ model <chr> "jetta", "malibu", "jetta", "a4", "explorer 4wd", "camry …
## $ displ <dbl> 2.0, 2.4, 2.0, 2.0, 4.0, 2.4, 5.7, 2.2, 2.8, 2.8, 1.8, 4.…
## $ year <dbl> 1999, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 199…
## $ cyl <dbl> 4, 4, 4, 4, 6, 4, 8, 4, 6, 6, 4, 8, 6, 8, 6, 6, 8, 6, 8, …
## $ trans <chr> "auto(l4)", "auto(l4)", "auto(s6)", "auto(av)", "manual(m…
## $ drv <chr> "f", "f", "f", "f", "4", "f", "4", "4", "4", "f", "f", "4…
## $ cty <dbl> 19, 22, 22, 21, 15, 22, 13, 21, 17, 18, 26, 13, 16, 13, 1…
## $ hwy <dbl> 26, 30, 29, 30, 19, 31, 18, 26, 25, 26, 35, 19, 24, 17, 2…
## $ fl <chr> "r", "r", "p", "p", "r", "r", "r", "r", "p", "p", "r", "r…
## $ class <chr> "compact", "midsize", "compact", "compact", "suv", "compa…
# inconsistent column names
# generating the samples with inconsistent column names
# ten random 20-row samples of mpg
mpg_samples2 <- map(1:10, ~ slice_sample(mpg, n = 20))
# case = "random" scrambles the capitalisation of every column name
inconsistent_dframes <- map(mpg_samples2, ~ janitor::clean_names(dat = .x, case = "random"))
map(inconsistent_dframes, ~ colnames(.x)) %>%
head## [[1]]
## [1] "MAnUfAcTuRER" "MOdEl" "dISPl" "yeaR" "cyl"
## [6] "trANs" "dRV" "cty" "hwY" "Fl"
## [11] "cLaSs"
##
## [[2]]
## [1] "MANuFaCtuReR" "mODEL" "diSpl" "year" "CYl"
## [6] "TrANs" "drV" "Cty" "HWY" "fl"
## [11] "CLAsS"
##
## [[3]]
## [1] "mANufaCTuRER" "mODeL" "DIsPl" "YeAR" "CyL"
## [6] "traNS" "DRV" "CTY" "hwY" "fL"
## [11] "ClasS"
##
## [[4]]
## [1] "maNUFaCturEr" "mODEl" "dISpl" "year" "CYL"
## [6] "TrAns" "Drv" "CTy" "HwY" "FL"
## [11] "Class"
##
## [[5]]
## [1] "MaNuFAcTUrER" "model" "disPl" "yEaR" "cYl"
## [6] "TRANs" "dRv" "ctY" "hWY" "Fl"
## [11] "CLASs"
##
## [[6]]
## [1] "MAnUfActUrER" "moDEl" "dIsPl" "YeAR" "CYl"
## [6] "TRANS" "DrV" "cTY" "hWY" "fL"
## [11] "ClAsS"
## [1] "cty" "cLaSs" "MOdEl" "dRV" "cyl"
## [6] "MAnUfAcTuRER" "yeaR" "Fl" "hwY" "dISPl"
## [11] "trANs"
##
## [[2]]
## [1] "CYl" "CLAsS" "diSpl" "Cty" "HWY" "drV"
##
## [[3]]
## [1] "fL" "mODeL" "CTY" "YeAR" "ClasS"
## [6] "DIsPl" "CyL" "mANufaCTuRER" "DRV" "hwY"
##
## [[4]]
## [1] "FL" "HwY" "CYL" "CTy"
##
## [[5]]
## [1] "MaNuFAcTUrER" "yEaR" "disPl"
##
## [[6]]
## [1] "hWY"
# saving to disk
# dir_create(c("input/unclean_files"))
# iwalk(inconsistent_dframes, ~ write_csv(.x, paste0("input/unclean_files/", .y, ".csv")))
# loading and cleaning the data frames
# name_repair = tolower harmonises the randomised column capitalisation so
# that like-named columns line up when map_dfr() row-binds the files
many_columns_data_frame <- dir_ls(path = "input/unclean_files/", glob = "*.csv", type = "file") %>%
map_dfr(~ read_csv(.x, name_repair = tolower, show_col_types = FALSE) %>%
mutate(filename = .x))
# showing results
many_columns_data_frame %>%
glimpse()## Rows: 200
## Columns: 12
## $ trans <chr> "manual(m5)", "manual(m5)", "manual(m6)", "auto(l5)", "ma…
## $ model <chr> "impreza awd", "tiburon", "a4 quattro", "a4", "ram 1500 p…
## $ year <dbl> 1999, 2008, 2008, 1999, 2008, 1999, 2008, 1999, 1999, 200…
## $ fl <chr> "r", "r", "p", "p", "e", "r", "r", "r", "r", "r", "r", "p…
## $ displ <dbl> 2.2, 2.0, 2.0, 2.8, 4.7, 3.8, 2.5, 3.8, 1.6, 4.2, 2.2, 1.…
## $ class <chr> "subcompact", "subcompact", "compact", "compact", "pickup…
## $ cyl <dbl> 4, 4, 4, 6, 8, 6, 5, 6, 4, 8, 4, 4, 8, 4, 6, 8, 5, 8, 4, …
## $ filename <fs::path> "input/unclean_files/1.csv", "input/unclean_files/1.…
## $ drv <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hwy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ cty <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ manufacturer <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# files not in the same folder
# forty random 20-row samples of mpg, split over two folders below
mpg_samples3 <- map(1:40, ~ slice_sample(mpg, n = 20))
# Create directories
# dir_create(c("input/nested_folders", "input/nested_folders/first_nested_folder", "input/nested_folders/second_nested_folder"))
# First folder
# iwalk(mpg_samples[1:20], ~ write_csv(.x, paste0("input/nested_folders/first_nested_folder/", .y, "_first.csv")))
# Second folder
# iwalk(mpg_samples[21:40], ~ write_csv(.x, paste0("input/nested_folders/second_nested_folder/", .y, "_second.csv")))
# NOTE(review): the commented iwalk() calls reference mpg_samples, not the
# mpg_samples3 created above — confirm which object was used to write the files
# searching through nested folders recursively
# recurse = TRUE descends into subfolders; the outer parentheses print the result
(csv_files_nested <- dir_ls("input/nested_folders/", glob = "*.csv", type = "file", recurse = TRUE))## input/nested_folders/first_nested_folder/10_first.csv
## input/nested_folders/first_nested_folder/11_first.csv
## input/nested_folders/first_nested_folder/12_first.csv
## input/nested_folders/first_nested_folder/13_first.csv
## input/nested_folders/first_nested_folder/14_first.csv
## input/nested_folders/first_nested_folder/15_first.csv
## input/nested_folders/first_nested_folder/16_first.csv
## input/nested_folders/first_nested_folder/17_first.csv
## input/nested_folders/first_nested_folder/18_first.csv
## input/nested_folders/first_nested_folder/19_first.csv
## input/nested_folders/first_nested_folder/1_first.csv
## input/nested_folders/first_nested_folder/20_first.csv
## input/nested_folders/first_nested_folder/2_first.csv
## input/nested_folders/first_nested_folder/3_first.csv
## input/nested_folders/first_nested_folder/4_first.csv
## input/nested_folders/first_nested_folder/5_first.csv
## input/nested_folders/first_nested_folder/6_first.csv
## input/nested_folders/first_nested_folder/7_first.csv
## input/nested_folders/first_nested_folder/8_first.csv
## input/nested_folders/first_nested_folder/9_first.csv
## input/nested_folders/second_nested_folder/10_second.csv
## input/nested_folders/second_nested_folder/11_second.csv
## input/nested_folders/second_nested_folder/12_second.csv
## input/nested_folders/second_nested_folder/13_second.csv
## input/nested_folders/second_nested_folder/14_second.csv
## input/nested_folders/second_nested_folder/15_second.csv
## input/nested_folders/second_nested_folder/16_second.csv
## input/nested_folders/second_nested_folder/17_second.csv
## input/nested_folders/second_nested_folder/18_second.csv
## input/nested_folders/second_nested_folder/19_second.csv
## input/nested_folders/second_nested_folder/1_second.csv
## input/nested_folders/second_nested_folder/20_second.csv
## input/nested_folders/second_nested_folder/2_second.csv
## input/nested_folders/second_nested_folder/3_second.csv
## input/nested_folders/second_nested_folder/4_second.csv
## input/nested_folders/second_nested_folder/5_second.csv
## input/nested_folders/second_nested_folder/6_second.csv
## input/nested_folders/second_nested_folder/7_second.csv
## input/nested_folders/second_nested_folder/8_second.csv
## input/nested_folders/second_nested_folder/9_second.csv
# reads every discovered CSV and tags each row with its source path
map_dfr(csv_files_nested, ~ read_csv(.x, show_col_types = FALSE) %>%
mutate(filename = .x)) %>%
glimpse()## Rows: 800
## Columns: 12
## $ manufacturer <chr> "toyota", "jeep", "honda", "volkswagen", "toyota", "subar…
## $ model <chr> "camry", "grand cherokee 4wd", "civic", "new beetle", "co…
## $ displ <dbl> 2.4, 4.7, 1.6, 2.5, 1.8, 2.5, 2.0, 5.7, 4.0, 5.9, 4.0, 2.…
## $ year <dbl> 2008, 2008, 1999, 2008, 1999, 2008, 1999, 1999, 1999, 199…
## $ cyl <dbl> 4, 8, 4, 5, 4, 4, 4, 8, 6, 8, 6, 4, 8, 8, 8, 4, 8, 6, 4, …
## $ trans <chr> "manual(m5)", "auto(l5)", "manual(m5)", "manual(m5)", "ma…
## $ drv <chr> "f", "4", "f", "f", "f", "4", "f", "r", "4", "4", "f", "4…
## $ cty <dbl> 21, 9, 23, 20, 26, 20, 19, 13, 14, 11, 16, 20, 11, 12, 9,…
## $ hwy <dbl> 31, 12, 29, 28, 35, 27, 26, 17, 17, 15, 23, 27, 15, 16, 1…
## $ fl <chr> "r", "e", "p", "r", "r", "r", "r", "r", "r", "r", "r", "r…
## $ class <chr> "midsize", "suv", "subcompact", "subcompact", "compact", …
## $ filename <fs::path> "input/nested_folders/first_nested_folder/10_first.c…
# selecting the files to import from a string pattern
# negate = TRUE drops matches: any path containing "2_first", "3_first" or
# "4_first" (note this also catches 12/13/14_first) or ending in "second.csv";
# 14 of the 40 files remain, hence the 280 rows below
csv_files_nested[str_detect(csv_files_nested, pattern = "[2-4]_first|second\\.csv$", negate = TRUE)] %>%
map_dfr(~ read_csv(.x, show_col_types = FALSE) %>%
mutate(filename = .x)) %>%
glimpse()## Rows: 280
## Columns: 12
## $ manufacturer <chr> "toyota", "jeep", "honda", "volkswagen", "toyota", "subar…
## $ model <chr> "camry", "grand cherokee 4wd", "civic", "new beetle", "co…
## $ displ <dbl> 2.4, 4.7, 1.6, 2.5, 1.8, 2.5, 2.0, 5.7, 4.0, 5.9, 4.0, 2.…
## $ year <dbl> 2008, 2008, 1999, 2008, 1999, 2008, 1999, 1999, 1999, 199…
## $ cyl <dbl> 4, 8, 4, 5, 4, 4, 4, 8, 6, 8, 6, 4, 8, 8, 8, 4, 8, 6, 4, …
## $ trans <chr> "manual(m5)", "auto(l5)", "manual(m5)", "manual(m5)", "ma…
## $ drv <chr> "f", "4", "f", "f", "f", "4", "f", "r", "4", "4", "f", "4…
## $ cty <dbl> 21, 9, 23, 20, 26, 20, 19, 13, 14, 11, 16, 20, 11, 12, 9,…
## $ hwy <dbl> 31, 12, 29, 28, 35, 27, 26, 17, 17, 15, 23, 27, 15, 16, 1…
## $ fl <chr> "r", "e", "p", "r", "r", "r", "r", "r", "r", "r", "r", "r…
## $ class <chr> "midsize", "suv", "subcompact", "subcompact", "compact", …
## $ filename <fs::path> "input/nested_folders/first_nested_folder/10_first.c…
The write_csv function writes tabular data to an ASCII
file in CSV format. Each row of data creates one line in the file, with
data items separated by commas (,).
# write_csv(horas_sol, "output/horas_sol.csv")“The aim of rio is to make data file I/O
[import/export] in R as easy as possible by implementing three simple
functions in Swiss-army knife style,” according to the project’s GitHub
page. Those functions are import(), export(),
and convert(). So, the rio package has just one function to
read in many different types of files: import(). Once
you’ve analyzed your data, if you want to save the results as a CSV,
Excel spreadsheet, or other format, rio’s export() function
can handle that. You can use R’s download.file function
with the syntax
download.file("url", "destinationFileName.csv") to download
files directly from the web. It’s possible rio will ask you to
re-download the file in binary format, in which case you’ll need to run
download.file("http://bit.ly/BostonSnowfallCSV", "BostonWinterSnowfalls.csv", mode = "wb").
# getting data from the web with R-built-in
download.file("http://bit.ly/BostonSnowfallCSV", "input/BostonWinterSnowfalls.csv")
# if the download arrives corrupted, re-download in binary mode:
# download.file("http://bit.ly/BostonSnowfallCSV", "BostonWinterSnowfalls.csv", mode = "wb")
# import data with rio locally
snowdata2 <- rio::import("input/BostonWinterSnowfalls.csv")
suicides <- rio::import("input/PDT-suicidesData.csv")
# rio::import("mySpreadsheet.xlsx", which = 2, col_names = c("City", "State", "Population"))If you want to download and import a file from the Web, you can do so
if it’s publicly available and in a format such as Excel or CSV you can
use rio. A lot of systems will be able to follow the
redirect URL to the file even after first giving you an error message,
as long as you specify the format as “csv” since the file name here
doesn’t include “.csv”. rio can also import well-formatted HTML tables
from Web pages, but the tables have to be extremely well-formatted. In
real life, though, Web data rarely appears in such neat, isolated form.
A good option for cases that aren’t quite as well crafted is often the
htmltab package. Since it wasn’t specified which table,
it pulled the first HTML table on the page. To download a specific table
use the which argument.
The most popular way to install packages from GitHub is to use a
package called devtools. devtools is an extremely
powerful package designed mostly for people who want to write their own
packages, and it includes a few ways to install packages from other
places besides CRAN. However, devtools usually requires a couple of
extra steps to install compared to a typical package. However, the
pacman package will also install packages from non-CRAN
sources like GitHub. You can use the number_with_commas()
function to change those character strings that should be numbers back
into numbers. The rmiscutils package isn’t the only way
to deal with imported numbers that have commas, the tidyverse readr
package also includes a function that turns character strings into
numbers, parse_number(). One advantage of
readr::parse_number() is that you can define your own
locale() to control things like encoding and decimal marks.
There’s an R package called janitor that can
automatically fix troublesome column names imported from a
non-R-friendly data source. You can create new clean column names using
janitor’s clean_names() function.
# getting data from the web with rio
# format = "csv" is needed because the redirect URL has no .csv extension
snowdata3 <- rio::import("http://bit.ly/BostonSnowfallCSV", format = "csv")
# getting html tables
# rio::import(format = "html") pulls the first HTML table on the page
design.tokens1 <- rio::import("https://designsystem.digital.gov/design-tokens/", format = "html")
# htmltab's which argument selects a specific table by position on the page
citytable <- htmltab("https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population", which = 5)
design.tokens2 <- htmltab("https://designsystem.digital.gov/design-tokens/", which = 6)
# installing packages from GitHub with pacman
pacman::p_load_gh("smach/rmiscutils")
# changing those character strings that should be numbers back into numbers
# number_with_commas() comes from rmiscutils; readr::parse_number() is the
# tidyverse equivalent
citytable$PopEst2021 <- number_with_commas(citytable$`2021estimate`)
citytable$Census2020 <- readr::parse_number(citytable$`2020census`)
# cleaning cols names
citytable_cleaned <- janitor::clean_names(citytable)
names(citytable_cleaned)## [1] "x2021rank" "city"
## [3] "state" "x2021estimate"
## [5] "x2020census" "change"
## [7] "x2020_land_area" "x2020_land_area_2"
## [9] "x2020_population_density" "x2020_population_density_2"
## [11] "location" "pop_est2021"
## [13] "census2020"
If you are interested in state or local government data in the US or Canada, you may want to check out RSocrata to see if an agency you’re interested in posts data there. I’ve yet to find a complete list of all available Socrata data sets, but there’s a search page at https://www.opendatanetwork.com.
It’s easy to add a column to a data frame. The name of the new column
is on the left, and there’s a formula on the right. Some of these
special dataframe functions (technically called “methods”) not only give
you information, but let you change characteristics of the data frame.
So, names(snowdata) tells you the column names in the data
frame, and assigning to it, as in names(snowdata) <- c(...), will change the column names in the data frame.
# adding cols
# 1 inch = 0.0254 m, so this converts the Total column (inches) to meters
snowdata$Meters <- snowdata$Total * 0.0254
# changing col names
# assigning to names() renames the columns in place
names(snowdata) <- c("Winter", "SnowInches", "SnowMeters")
# changing from num to chr
download.file("https://raw.githubusercontent.com/smach/R4JournalismBook/master/data/bostonzips.txt", "input/bostonzips.txt")
# colClasses forces both columns to be read as text rather than numbers
zips <- rio::import("input/bostonzips.txt", colClasses = c("character", "character"))
# or
# zips <- rio::import("input/bostonzips.txt", colClasses = rep("character", 2))
# rep("character", 2) is the same as c("character", "character"), so colClasses = rep("character", 2) is equivalent to colClasses = c("character", "character")Often after you’ve wrangled your data in R, you’ll want to save your results. Here are some of the ways to export your data:
Save to a CSV file with
rio::export(myObjectName, file="myFileName.csv") and to an
Excel file with
rio::export(myObjectName, file="myFileName.xlsx"). rio
understands what file format you want based on the extension of the file
name. There are several other available formats, including .tsv for
tab-separated data, .json for JSON and .xml for XML.
Save to an R binary object that makes it easy to load back into R in future sessions. There are two options.
Generic save() will save one or more objects into a
file, such as
save(objectName1, objectName2, file="myfilename.RData"). To
read this data back into R, you just use the command
load("myfilename.RData") and all the objects return with
the same names in the same state they had before.
You can also save a single object into a file with
saveRDS(myobject, file="filename.rds"). The logical
assumption would be that loadRDS would read the file back in, but
instead the command is readRDS – and in this case, just the data has
been stored, not the object name. So, you need to read the data into a
new object name, such as
mydata <- readRDS("filename.rds").
You can also export an R object into your Windows or Mac clipboard
with rio: rio::export(myObjectName, format = "clipboard").
And, you can import data into R from your clipboard the same way:
rio::import(file = "clipboard").
rio’s convert() function lets you convert one file type
to another without having to manually pull the data into and then out of
R.
The openxlsx package makes writing to Excel files relatively easy. While there are lots of options in openxlsx, a typical pattern is to specify an Excel filename and a sheet name.
# write.xlsx(horas_sol, sheetName = "horas_sol", file = "output/horas_sol.xlsx")While rio is a great Swiss Army knife of file handling, there may be times when you want a bit more control over how your data is pulled into or saved out of R. In addition, there have been times when I’ve had a challenging data file that rio choked on but another package could handle. Some other functions and packages you may want to explore:
Base R’s read.csv() and read.table() to
import text files (use ?read.csv and ?read.table to get more
information). stringsAsFactors = FALSE is needed with these
if you want to keep your character strings as character strings.
write.csv() will save to CSV.
Wickham’s readr package is also worth a look as part of the “tidyverse.” readr includes functions to read CSV, tab-separated, fixed-width, Web logs, and several other types of files. readr prints out the type of data it has determined for each column – integer, character, double (non-whole numbers), etc. It creates tibbles.
The googlesheets package lets you import data
from a Google Sheet, even if it’s private, by authenticating your Google
account. The package is available on CRAN; install it with
install.packages("googlesheets").
If you are working with large data sets, speed may become important
to you when saving and loading files. The data.table
package has a speedy fread() function, but beware that
resulting objects are data.tables and not plain data frames; some
behaviors are different. If you want a conventional data frame, you can
get one with the as.data.frame(mydatatable) syntax.
fwrite() function is aimed at writing to a CSV file
considerably faster than base R’s write.csv().
The feather package saves in a binary format that
can be read either into R or Python. And, the fst
package’s read.fst() and write.fst() offer
fast saving and loading of R data frame objects – plus the option of
file compression.
Often the values required for a particular operation can be found in
a data frame, but they are not organized in the appropriate way. As a
simple example, data for multiple groups are often stored in
spreadsheets or data summaries as columns, with a separate column for
each group. Most of the modeling and graphics functions in R will not be
able to work with such data; they expect the values to be in a single
column with an additional column that specifies the group from which the
data arose. The stack function can reorganize datasets to
have this property. If there were other variables in the data frame that
did not need to be converted to this form, the select=
argument to stack allows you to specify the variables that should be
used, similar to the same argument to the subset function.
The unstack function will reorganize stacked data back to
the one column per group form. To use unstack, a formula
must be provided to explain the roles of the variables to be
unstacked.
For more complex reorganizations, the concept of “wide” versus “long”
datasets is often helpful. When there are multiple occurrences of values
for a single observation, a data frame is said to be long if each
occurrence is a separate row in the data frame; if all of the
occurrences of values for a given observation are in the same row, then
the dataset is said to be wide. The reshape function
converts datasets between these two forms. Perhaps the most common use
of reshape involves repeated measures analyses, where the same variable
is recorded for each observation at several different times.
To use reshape to convert the dataset to wide format, we need to provide five arguments. The first argument is the data frame to be reshaped. The next three arguments provide the names of the columns that will be involved in the reshaping. The idvar= argument provides the names of the variables that define the experimental unit which was repeatedly measured. In this case, it’s the subj variable. The v.names= argument tells reshape which variables in the long format will be used to create the multiple variables in the wide format. In this example, we want both x and y to be expanded to multiple variables, so we’d specify a vector with both those names. The timevar= variable tells which variable identifies the sequence number that will be used to create the multiple versions of the v.names variables; in this case it will be time. Finally, the direction= argument accepts values of “wide” or “long”, depending on which transformation is to be performed.
The names x.1, y.1, etc. were formed by joining together the variable
names of the variables specified in the v.names= argument
with the values of the timevar= variable. Any variables not
specified in the v.names= argument are assumed to be
constant for all observations with the same values as the
idvar= variables, and a single copy of such variables will
be included in the output data frame. Only the variables whose names
appear in the v.names= argument will be converted into
multiple variables, so if any variables that are in the data frame but
not in the v.names= argument are not constant, reshape will
print a warning message, and use the first value of such variables when
converting to wide format. To prevent variables from being transferred
to the output data frame, the drop= argument can be used to pass a
vector of variable names to be ignored in the conversion.
The information about the reshaping procedure is stored as attributes
in converted data frames, so once a data frame has been converted with
reshape, it can be changed to its previous format by passing just the
data frame with no additional arguments to reshape.
Since reshape can handle multiple sets of variables, the
varying= argument should be passed a list containing
vectors with the names of the different sets of variables that should be
mapped to a single variable in the long dataset. The automatically
generated variable id is simply a numeric index corresponding to the
type variable; using idvar="type" will suppress its
creation. The automatically generated variable time defaults to a set of
consecutive integers; providing more meaningful values through the
times= argument will label the values properly. Finally,
the name of the column representing the values (which defaults to the
first name in the varying= argument) can be set to a more meaningful
name with the v.names= argument.
The reshape package uses the concept of “melting” a dataset (through the melt function) into a data frame which contains separate columns for each id variable, a variable column containing the name of each measured variable, and a final column named value with the variable’s value. It may be noticed that this melting operation is essentially a “wide-to-long” reshaping of the data.
For long-to-wide conversions, recall that variables appearing to the left of the tilde in the formula passed to cast will appear in the columns of the output, while those on the right will appear in the rows.
At the most basic level, two or more data frames can be combined by
rows using rbind, or by columns using cbind.
For rbind, the data frames must have the same number of
columns; for cbind, the data frames must have the same
number of rows. Vectors or matrices passed to cbind will be
converted to data frames, so the mode of columns passed to
cbind will be preserved. While cbind will
demand that data frames and matrices are conformable (that is, they have
the same number of rows), vectors passed to cbind will be recycled if
the number of rows in the data frame or matrix is an even multiple of
the length of the vector. It may be a good idea to use unique names when
combining data frames in this way. An easy way to test is to pass the
names of the two data frames to the intersect function.
When using rbind, the names and classes of values to be
joined must match, or a variety of errors may occur.
Although the rbind function will demand that the names
of the objects being combined agree, cbind does not do any
such checking. To combine data frames based on the values of common
variables, the merge function should be used. This function
is designed to provide the same sort of functionality and behavior as
the table joins provided by relational databases. Although merge is
limited to operating on two data frames at a time, it can be called
repeatedly to deal with more than two data frames. The default behavior
of merge is to join together rows of the data frames based
on the values of all of the variables (columns) that the data frames
have in common. (In database terminology, this is known as a natural
join.) When called without any other arguments, merge returns only those
rows which had observations in both data frames.
Although there were six unique values for a between the two data
frames, only those rows with values of a in both data frames are
represented in the output. To modify this, the all=,
all.x=, and all.y= arguments can be used.
Specifying all=TRUE will include all rows (full outer join,
in database terminology), all.x=TRUE will include all rows
from the first data frame (left outer join), and all.y=TRUE
does the same for the second data frame (right outer join).
To take more control over which variables are used to merge rows of
the data frame, the by= argument can be used. You provide
the by= argument with a vector of the name or names of the
variables that should be used for the merge. If the merging variables
have different names in the data frames to be merged, the
by.x= and by.y= arguments can be used.
# Small wide-format example: three groups of five observations each.
mydata <- data.frame(
  grp1 = c(12, 15, 19, 22, 25),
  grp2 = c(18, 12, 42, 29, 44),
  grp3 = c(8, 17, 22, 19, 31)
)
# stack() reshapes wide -> long, producing a `values`/`ind` column pair.
sdata <- stack(mydata)
sdata## values ind
## 1 12 grp1
## 2 15 grp1
## 3 19 grp1
## 4 22 grp1
## 5 25 grp1
## 6 18 grp2
## 7 12 grp2
## 8 42 grp2
## 9 29 grp2
## 10 44 grp2
## 11 8 grp3
## 12 17 grp3
## 13 22 grp3
## 14 19 grp3
## 15 31 grp3
# converting back to the original wide form (inverse of stack())
mydata <- unstack(sdata, values ~ ind)
mydata## grp1 grp2 grp3
## 1 12 18 8
## 2 15 12 17
## 3 19 42 22
## 4 22 29 19
## 5 25 44 31
# Simulated repeated-measures data for base reshape(): 4 subjects x 3 times.
set.seed(17)  # fixed seed so the simulated values are reproducible
obs <- data.frame(
  subj = rep(1:4, each = 3),
  time = rep(1:3, times = 4),
  x = rnorm(12),
  y = rnorm(12)
)
head(obs)## subj time x y
## 1 1 1 -1.01500872 1.29532187
## 2 1 2 -0.07963674 0.18791807
## 3 1 3 -0.23298702 1.59120510
## 4 2 1 -0.81726793 -0.05517906
## 5 2 2 0.77209084 0.83847112
## 6 2 3 -0.16561194 0.15937013
# from long to wide: one row per subject; x and y spread into x.<time>/y.<time> columns
wideobs <- reshape(obs, idvar = "subj", v.names = c("x", "y"), timevar = "time", direction = "wide")
head(wideobs)## subj x.1 y.1 x.2 y.2 x.3 y.3
## 1 1 -1.0150087 1.29532187 -0.07963674 0.1879181 -0.2329870 1.5912051
## 4 2 -0.8172679 -0.05517906 0.77209084 0.8384711 -0.1656119 0.1593701
## 7 3 0.9728744 0.62595440 1.71653398 0.6335847 0.2552370 0.6810276
## 10 4 0.3665811 -0.68203337 1.18078924 -0.7232567 0.6431921 1.6735260
# from wide to long
# reshape() stamped "reshapeWide" attributes on wideobs, so calling it again
# with no further arguments reverses the earlier transformation
obs <- reshape(wideobs)
head(obs)## subj time x y
## 1.1 1 1 -1.01500872 1.29532187
## 2.1 2 1 -0.81726793 -0.05517906
## 3.1 3 1 0.97287443 0.62595440
## 4.1 4 1 0.36658112 -0.68203337
## 1.2 1 2 -0.07963674 0.18791807
## 2.2 2 2 0.77209084 0.83847112
# from wide to long (complex example): turn the row names of the built-in
# USPersonalExpenditure matrix into an ordinary `type` column
usp <- data.frame(
  type = rownames(USPersonalExpenditure),
  USPersonalExpenditure,
  row.names = NULL
)
head(usp)## type X1940 X1945 X1950 X1955 X1960
## 1 Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## 2 Household Operation 10.500 15.500 29.00 36.5 46.20
## 3 Medical and Health 3.530 5.760 9.71 14.0 21.10
## 4 Personal Care 1.040 1.980 2.45 3.4 5.40
## 5 Private Education 0.341 0.974 1.80 2.6 3.64
# melt every year column into (time, expend) pairs, keyed by `type`
rr <- reshape(
  usp,
  varying = list(names(usp)[-1]),
  idvar = "type",
  times = seq(1940, 1960, by = 5),
  v.names = "expend",
  direction = "long"
)
head(rr)## type time expend
## Food and Tobacco.1940 Food and Tobacco 1940 22.200
## Household Operation.1940 Household Operation 1940 10.500
## Medical and Health.1940 Medical and Health 1940 3.530
## Personal Care.1940 Personal Care 1940 1.040
## Private Education.1940 Private Education 1940 0.341
## Food and Tobacco.1945 Food and Tobacco 1945 44.500
# an alternative way of reshaping the usp data frame, without having to explicitly provide the values of the times
# split= describes how reshape() should cut the varying names apart; here the
# "X1" pattern marks the split and include = TRUE keeps the matched digits
# with the time part, so the times are recovered from the names themselves
# — TODO confirm against ?reshape
rr1 <- reshape(usp, varying = names(usp)[-1], idvar = "type", split = list(regexp = "X1", include = TRUE), direction = "long")
head(rr1)## type time X
## Food and Tobacco.1940 Food and Tobacco 1940 22.200
## Household Operation.1940 Household Operation 1940 10.500
## Medical and Health.1940 Medical and Health 1940 3.530
## Personal Care.1940 Personal Care 1940 1.040
## Private Education.1940 Private Education 1940 0.341
## Food and Tobacco.1945 Food and Tobacco 1945 44.500
# using melt from reshape package (wide-to-long)
# (the reshape package is superseded by tidyr's pivot_longer()/pivot_wider())
musp <- reshape::melt(usp)
# or
# reshape::cast(musp, variable + type ~ .)
head(musp)## type variable value
## 1 Food and Tobacco X1940 22.200
## 2 Household Operation X1940 10.500
## 3 Medical and Health X1940 3.530
## 4 Personal Care X1940 1.040
## 5 Private Education X1940 0.341
## 6 Food and Tobacco X1945 44.500
# drop the "X" prefix from the year labels and convert them to numbers
musp$variable <- as.numeric(sub("X", "", musp$variable, fixed = TRUE))
# give the melted columns descriptive names
colnames(musp)[2:3] <- c("time", "expend")
head(musp)## type time expend
## 1 Food and Tobacco 1940 22.200
## 2 Household Operation 1940 10.500
## 3 Medical and Health 1940 3.530
## 4 Personal Care 1940 1.040
## 5 Private Education 1940 0.341
## 6 Food and Tobacco 1945 44.500
# using cast from the reshape package (long-to-wide)
set.seed(999)  # reproducible simulated data
obs2 <- data.frame(
  subj = rep(1:4, each = 3),
  time = rep(1:3, times = 4),
  x = rnorm(12),
  y = rnorm(12)
)
mobs <- reshape::melt(obs2)
# reshape::cast(subj ~ variable + time, data = mobs)
# combining data sets using cbind and rbind
x <- data.frame(
  a = c("A", "B", "C"),
  x = c(12, 15, 19)
)
y <- data.frame(
  a = c("D", "E", "F", "G"),
  x = c(19, 21, 14, 12)
)
intersect(names(x), names(y))## [1] "a" "x"
cbind(y, z = c(1, 2))## a x z
## 1 D 19 1
## 2 E 21 2
## 3 F 14 1
## 4 G 12 2
# combining data sets based on the values of common variables
x <- data.frame(
  a = c(1, 2, 4, 5, 6),
  x = c(9, 12, 14, 21, 8)
)
y <- data.frame(
  a = c(1, 3, 4, 6),
  y = c(8, 14, 19, 2)
)
merge(x, y)## a x y
## 1 1 9 8
## 2 4 14 19
## 3 6 8 2
# outer join
merge(x, y, all = TRUE)## a x y
## 1 1 9 8
## 2 2 12 NA
## 3 3 NA 14
## 4 4 14 19
## 5 5 21 NA
## 6 6 8 2
# left outer join
merge(x, y, all.x = TRUE)## a x y
## 1 1 9 8
## 2 2 12 NA
## 3 4 14 19
## 4 5 21 NA
## 5 6 8 2
# right outer join
merge(x, y, all.y = TRUE)## a x y
## 1 1 9 8
## 2 3 NA 14
## 3 4 14 19
## 4 6 8 2
# City -> state lookup tables; state.abb is the common key for merge().
cities <- data.frame(
  city = c(
    "New York", "Boston", "Juneau", "Anchorage", "San Diego",
    "Philadelphia", "Los Angeles", "Fairbanks", "Ann Arbor", "Seattle"
  ),
  state.abb = c("NY", "MA", "AK", "AK", "CA", "PA", "CA", "AK", "MI", "WA")
)
states <- data.frame(
  state.abb = c("NY", "MA", "AK", "CA", "PA", "MI", "WA"),
  state = c(
    "New York", "Massachusetts", "Alaska", "California",
    "Pennsylvania", "Michigan", "Washington"
  )
)
merge(cities, states)## state.abb city state
## 1 AK Juneau Alaska
## 2 AK Anchorage Alaska
## 3 AK Fairbanks Alaska
## 4 CA San Diego California
## 5 CA Los Angeles California
## 6 MA Boston Massachusetts
## 7 MI Ann Arbor Michigan
## 8 NY New York New York
## 9 PA Philadelphia Pennsylvania
## 10 WA Seattle Washington
# another example
# mayordata <- merge(contributions_split, results_split, all.x = TRUE, all.y = TRUE, by.x = "LastName", by.y = "LastName")
There are three interrelated rules which make a dataset tidy: 1. Each variable must have its own column. 2. Each observation must have its own row. 3. Each value must have its own cell. These three rules are interrelated because it's impossible to only satisfy two of the three. That interrelationship leads to a simpler set of practical instructions: 1. Put each dataset in a tibble. 2. Put each variable in a column.
Tidy data set
pivot_longer() “lengthens” data, increasing the number
of rows and decreasing the number of columns. The inverse transformation
is pivot_wider(). pivot_longer() is an updated
approach to gather(), designed to be both simpler to use
and to handle more use cases. We recommend you use
pivot_longer() for new code; gather() isn’t
going away but is no longer under active development.
pivot_wider() “widens” data, increasing the number of
columns and decreasing the number of rows. The inverse transformation is
pivot_longer(). pivot_wider() is an updated
approach to spread(), designed to be both simpler to use
and to handle more use cases. We recommend you use
pivot_wider() for new code; spread() isn’t
going away but is no longer under active development.
# wide to long
# where column names are character data
head(relig_income)## # A tibble: 6 × 11
## religion `<$10k` $10-2…¹ $20-3…² $30-4…³ $40-5…⁴ $50-7…⁵ $75-1…⁶ $100-…⁷
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agnostic 27 34 60 81 76 137 122 109
## 2 Atheist 12 27 37 52 35 70 73 59
## 3 Buddhist 27 21 30 34 33 58 62 39
## 4 Catholic 418 617 732 670 638 1116 949 792
## 5 Don’t know/re… 15 14 15 11 10 35 21 17
## 6 Evangelical P… 575 869 1064 982 881 1486 949 723
## # … with 2 more variables: `>150k` <dbl>, `Don't know/refused` <dbl>, and
## # abbreviated variable names ¹`$10-20k`, ²`$20-30k`, ³`$30-40k`, ⁴`$40-50k`,
## # ⁵`$50-75k`, ⁶`$75-100k`, ⁷`$100-150k`
# wide to long: every non-religion column name becomes an `income` value,
# its cell the `count`
relig_income %>%
  pivot_longer(!religion, names_to = "income", values_to = "count")## # A tibble: 180 × 3
## religion income count
## <chr> <chr> <dbl>
## 1 Agnostic <$10k 27
## 2 Agnostic $10-20k 34
## 3 Agnostic $20-30k 60
## 4 Agnostic $30-40k 81
## 5 Agnostic $40-50k 76
## 6 Agnostic $50-75k 137
## 7 Agnostic $75-100k 122
## 8 Agnostic $100-150k 109
## 9 Agnostic >150k 84
## 10 Agnostic Don't know/refused 96
## # … with 170 more rows
# columns have common prefix and missing values are structural so should be dropped
head(billboard)## # A tibble: 6 × 79
## artist track date.ent…¹ wk1 wk2 wk3 wk4 wk5 wk6 wk7 wk8 wk9
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 Pac Baby… 2000-02-26 87 82 72 77 87 94 99 NA NA
## 2 2Ge+her The … 2000-09-02 91 87 92 NA NA NA NA NA NA
## 3 3 Door… Kryp… 2000-04-08 81 70 68 67 66 57 54 53 51
## 4 3 Door… Loser 2000-10-21 76 76 72 69 67 65 55 59 62
## 5 504 Bo… Wobb… 2000-04-15 57 34 25 17 17 31 36 49 53
## 6 98^0 Give… 2000-08-19 51 39 34 26 26 19 2 2 3
## # … with 67 more variables: wk10 <dbl>, wk11 <dbl>, wk12 <dbl>, wk13 <dbl>,
## # wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>, wk19 <dbl>,
## # wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>, wk25 <dbl>,
## # wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>, wk31 <dbl>,
## # wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>, wk37 <dbl>,
## # wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, wk43 <dbl>,
## # wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, wk47 <dbl>, wk48 <dbl>, wk49 <dbl>, …
# lengthen billboard: gather the weekly-rank columns into (week, rank)
billboard %>%
  pivot_longer(
    cols = starts_with("wk"),    # all weekly-rank columns
    names_to = "week",
    names_prefix = "wk",         # strip "wk" so `week` holds just the number
    values_to = "rank",
    values_drop_na = TRUE        # NAs are structural (song off the chart): drop
  )## # A tibble: 5,307 × 5
## artist track date.entered week rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 7 99
## 8 2Ge+her The Hardest Part Of ... 2000-09-02 1 91
## 9 2Ge+her The Hardest Part Of ... 2000-09-02 2 87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02 3 92
## # … with 5,297 more rows
# multiple variables stored in column names
head(tidyr::who)## # A tibble: 6 × 60
## country iso2 iso3 year new_sp…¹ new_s…² new_s…³ new_s…⁴ new_s…⁵ new_s…⁶
## <chr> <chr> <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 Afghanistan AF AFG 1980 NA NA NA NA NA NA
## 2 Afghanistan AF AFG 1981 NA NA NA NA NA NA
## 3 Afghanistan AF AFG 1982 NA NA NA NA NA NA
## 4 Afghanistan AF AFG 1983 NA NA NA NA NA NA
## 5 Afghanistan AF AFG 1984 NA NA NA NA NA NA
## 6 Afghanistan AF AFG 1985 NA NA NA NA NA NA
## # … with 50 more variables: new_sp_m65 <int>, new_sp_f014 <int>,
## # new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## # new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## # new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## # new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## # new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## # new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>, …
# split each WHO column name into three variables:
# "new_?(.*)_(.)(.*)" captures diagnosis (sp/sn/ep/rel — the "_" after "new"
# is optional because of "newrel"), then gender (one char, m/f), then age band
tidyr::who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65,
    names_to = c("diagnosis", "gender", "age"),
    names_pattern = "new_?(.*)_(.)(.*)",
    values_to = "count"
  )## # A tibble: 405,440 × 8
## country iso2 iso3 year diagnosis gender age count
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan AF AFG 1980 sp m 014 NA
## 2 Afghanistan AF AFG 1980 sp m 1524 NA
## 3 Afghanistan AF AFG 1980 sp m 2534 NA
## 4 Afghanistan AF AFG 1980 sp m 3544 NA
## 5 Afghanistan AF AFG 1980 sp m 4554 NA
## 6 Afghanistan AF AFG 1980 sp m 5564 NA
## 7 Afghanistan AF AFG 1980 sp m 65 NA
## 8 Afghanistan AF AFG 1980 sp f 014 NA
## 9 Afghanistan AF AFG 1980 sp f 1524 NA
## 10 Afghanistan AF AFG 1980 sp f 2534 NA
## # … with 405,430 more rows
# multiple observations per row
head(anscombe)## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
# ".value" in names_to means: that captured part of the column name ("x"/"y")
# names the output column; the second capture ("1".."4") becomes `set`
anscombe %>%
  pivot_longer(everything(),
    names_to = c(".value", "set"),
    names_pattern = "(.)(.)"
  )## # A tibble: 44 × 3
## set x y
## <chr> <dbl> <dbl>
## 1 1 10 8.04
## 2 2 10 9.14
## 3 3 10 7.46
## 4 4 8 6.58
## 5 1 8 6.95
## 6 2 8 8.14
## 7 3 8 6.77
## 8 4 8 5.76
## 9 1 13 7.58
## 10 2 13 8.74
## # … with 34 more rows
# long to wide
head(fish_encounters)## # A tibble: 6 × 3
## fish station seen
## <fct> <fct> <int>
## 1 4842 Release 1
## 2 4842 I80_1 1
## 3 4842 Lisbon 1
## 4 4842 Rstr 1
## 5 4842 Base_TD 1
## 6 4842 BCE 1
# long to wide: one column per station, cells taken from `seen`
fish_encounters %>%
  pivot_wider(names_from = station, values_from = seen)## # A tibble: 19 × 12
## fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
## <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 4842 1 1 1 1 1 1 1 1 1 1 1
## 2 4843 1 1 1 1 1 1 1 1 1 1 1
## 3 4844 1 1 1 1 1 1 1 1 1 1 1
## 4 4845 1 1 1 1 1 NA NA NA NA NA NA
## 5 4847 1 1 1 NA NA NA NA NA NA NA NA
## 6 4848 1 1 1 1 NA NA NA NA NA NA NA
## 7 4849 1 1 NA NA NA NA NA NA NA NA NA
## 8 4850 1 1 NA 1 1 1 1 NA NA NA NA
## 9 4851 1 1 NA NA NA NA NA NA NA NA NA
## 10 4854 1 1 NA NA NA NA NA NA NA NA NA
## 11 4855 1 1 1 1 1 NA NA NA NA NA NA
## 12 4857 1 1 1 1 1 1 1 1 1 NA NA
## 13 4858 1 1 1 1 1 1 1 1 1 1 1
## 14 4859 1 1 1 1 1 NA NA NA NA NA NA
## 15 4861 1 1 1 1 1 1 1 1 1 1 1
## 16 4862 1 1 1 1 1 1 1 1 1 NA NA
## 17 4863 1 1 NA NA NA NA NA NA NA NA NA
## 18 4864 1 1 NA NA NA NA NA NA NA NA NA
## 19 4865 1 1 1 NA NA NA NA NA NA NA NA
# filling in missing values
# values_fill replaces the NAs produced for fish/station pairs never observed
fish_encounters %>%
  pivot_wider(names_from = station, values_from = seen, values_fill = 0)## # A tibble: 19 × 12
## fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
## <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 4842 1 1 1 1 1 1 1 1 1 1 1
## 2 4843 1 1 1 1 1 1 1 1 1 1 1
## 3 4844 1 1 1 1 1 1 1 1 1 1 1
## 4 4845 1 1 1 1 1 0 0 0 0 0 0
## 5 4847 1 1 1 0 0 0 0 0 0 0 0
## 6 4848 1 1 1 1 0 0 0 0 0 0 0
## 7 4849 1 1 0 0 0 0 0 0 0 0 0
## 8 4850 1 1 0 1 1 1 1 0 0 0 0
## 9 4851 1 1 0 0 0 0 0 0 0 0 0
## 10 4854 1 1 0 0 0 0 0 0 0 0 0
## 11 4855 1 1 1 1 1 0 0 0 0 0 0
## 12 4857 1 1 1 1 1 1 1 1 1 0 0
## 13 4858 1 1 1 1 1 1 1 1 1 1 1
## 14 4859 1 1 1 1 1 0 0 0 0 0 0
## 15 4861 1 1 1 1 1 1 1 1 1 1 1
## 16 4862 1 1 1 1 1 1 1 1 1 0 0
## 17 4863 1 1 0 0 0 0 0 0 0 0 0
## 18 4864 1 1 0 0 0 0 0 0 0 0 0
## 19 4865 1 1 1 0 0 0 0 0 0 0 0
# generating column names from multiple variables
head(us_rent_income)## # A tibble: 6 × 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 01 Alabama income 24476 136
## 2 01 Alabama rent 747 3
## 3 02 Alaska income 32940 508
## 4 02 Alaska rent 1200 13
## 5 04 Arizona income 27517 148
## 6 04 Arizona rent 972 4
us_rent_income %>%
pivot_wider(names_from = variable, values_from = c(estimate, moe))## # A tibble: 52 × 6
## GEOID NAME estimate_income estimate_rent moe_income moe_rent
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 01 Alabama 24476 747 136 3
## 2 02 Alaska 32940 1200 508 13
## 3 04 Arizona 27517 972 148 4
## 4 05 Arkansas 23789 709 165 5
## 5 06 California 29454 1358 109 3
## 6 08 Colorado 32401 1125 109 5
## 7 09 Connecticut 35326 1123 195 5
## 8 10 Delaware 31560 1076 247 10
## 9 11 District of Columbia 43198 1424 681 17
## 10 12 Florida 25952 1077 70 3
## # … with 42 more rows
# when there are multiple `names_from` or `values_from`, you can use `names_sep` or `names_glue` to control the output variable names
us_rent_income %>%
pivot_wider(
names_from = variable,
names_sep = ".",
values_from = c(estimate, moe)
)## # A tibble: 52 × 6
## GEOID NAME estimate.income estimate.rent moe.income moe.rent
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 01 Alabama 24476 747 136 3
## 2 02 Alaska 32940 1200 508 13
## 3 04 Arizona 27517 972 148 4
## 4 05 Arkansas 23789 709 165 5
## 5 06 California 29454 1358 109 3
## 6 08 Colorado 32401 1125 109 5
## 7 09 Connecticut 35326 1123 195 5
## 8 10 Delaware 31560 1076 247 10
## 9 11 District of Columbia 43198 1424 681 17
## 10 12 Florida 25952 1077 70 3
## # … with 42 more rows
# names_glue builds each output column name from the `variable` value and the
# name of the value column (.value), e.g. "income_estimate"
us_rent_income %>%
  pivot_wider(
    names_from = variable,
    names_glue = "{variable}_{.value}",
    values_from = c(estimate, moe)
  )## # A tibble: 52 × 6
## GEOID NAME income_estimate rent_estimate income_moe rent_moe
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 01 Alabama 24476 747 136 3
## 2 02 Alaska 32940 1200 508 13
## 3 04 Arizona 27517 972 148 4
## 4 05 Arkansas 23789 709 165 5
## 5 06 California 29454 1358 109 3
## 6 08 Colorado 32401 1125 109 5
## 7 09 Connecticut 35326 1123 195 5
## 8 10 Delaware 31560 1076 247 10
## 9 11 District of Columbia 43198 1424 681 17
## 10 12 Florida 25952 1077 70 3
## # … with 42 more rows
# NOTE(review): this shadows the built-in `warpbreaks` dataset for the rest
# of the session with a column-reordered tibble copy
warpbreaks <- as_tibble(warpbreaks[c("wool", "tension", "breaks")])
head(warpbreaks)## # A tibble: 6 × 3
## wool tension breaks
## <fct> <fct> <dbl>
## 1 A L 26
## 2 A L 30
## 3 A L 54
## 4 A L 25
## 5 A L 70
## 6 A L 52
# each (tension, wool) combination has several breaks values;
# values_fn = mean collapses the duplicates instead of making list-columns
warpbreaks %>%
  pivot_wider(
    names_from = wool,
    values_from = breaks,
    values_fn = mean
  )## # A tibble: 3 × 3
## tension A B
## <fct> <dbl> <dbl>
## 1 L 44.6 28.2
## 2 M 24 28.8
## 3 H 24.6 18.8
Existen otras tres funciones que son muy útiles para la manipulación
de datos: separate(), que sirve para separar una columna
en varias nuevas; case_when(), que sirve para establecer
condicionales y es similar a ifelse(), y
complete(), que sirve para completar una variable o una
combinación de variables.
Para mostrar la primera función convertimos la fecha en tres
columnas: año (yr), mes (mo) y día (dy), cuyos elementos están separados
por “-“. Por ello, es necesario indicar la columna afectada, los nombres
de las nuevas columnas y el símbolo separador. La función
mutate_all() aplica a todas las columnas otra función, en
este caso, as.numeric() para convertir todas en
numéricas.
En lugar de encapsular y encadenar ifelse(), podemos
usar la función case_when(), en la que empleamos fórmulas
en dos tiempos: por un lado la condición; por otro, la acción cuando se
cumpla esa condición.
Nuestro conjunto de datos contiene datos diarios desde el 1 de enero
de 1980 hasta el 31 de diciembre de 2015. Así que únicamente debemos
crear un vector con fechas de este periodo. En la función
complete() indicamos la columna que queremos completar y le
asignamos el vector entero de fechas. El resultado es un nuevo
data.frame con todas las fechas, rellenando el resto de
columnas con NA.
The extract function is basically the separate function
with super powers and works with groups instead of separators. The
separate function allows you to split a character variable
into multiple variables. The key difference between
separate and extract is that
extract works with groups within its regular expressions.
Each captured group is converted into a new column. So instead of
thinking of the separator in separate with extract, we
think of groups. extract takes a few arguments:
To extract columns that are more complicated and confusing, we need
to learn the concept of non-grouping parentheses.
Non-grouping parentheses define groups that are not captured. In other
words, these groups are not converted into a new column. A non-grouping
parenthesis is defined by a group that starts with a question mark and a
colon: (?:). The advantage of this method is that we can
solve column separation problems caused by messy or inconsistent
variables.
tidyr’s separate() function will split a data frame
column into multiple columns based on a delimiter of your choice. The
syntax is
separate(my_df, my_col_name, my_new_col_names, my_delimiter, my_number_of_new_columns).
Regular Expressions
# daily mean temperature file (TG); the first 20 lines are header metadata
ta_vigo <- read_csv("input/TG_STAID001395.txt", skip = 20)
head(ta_vigo)## # A tibble: 6 × 5
## STAID SOUID DATE TG Q_TG
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1395 20408 19560501 -9999 9
## 2 1395 20408 19560502 -9999 9
## 3 1395 20408 19560503 -9999 9
## 4 1395 20408 19560504 -9999 9
## 5 1395 20408 19560505 -9999 9
## 6 1395 20408 19560506 -9999 9
# -9999 is the file's missing-value sentinel; TG is stored in tenths (TG / 10)
ta_vigo <- mutate(ta_vigo, DATE = lubridate::ymd(DATE), TG = ifelse(TG == -9999, NA, TG / 10)) %>%
  filter(DATE >= "1980-01-01", DATE <= "2015-12-31") %>%  # keep the study period
  select(-STAID:-SOUID, -Q_TG) %>%  # drop station/source id and quality flag
  rename(date = DATE)
head(ta_vigo)## # A tibble: 6 × 2
## date TG
## <date> <dbl>
## 1 1980-01-01 11.2
## 2 1980-01-02 9.5
## 3 1980-01-03 7.5
## 4 1980-01-04 10.3
## 5 1980-01-05 7.8
## 6 1980-01-06 7.3
# daily sunshine series (SS), same station; -9999 is the missing-value
# sentinel and values are stored in tenths (SS / 10)
horas_sol <- read_csv("input/SS_STAID001395.txt", skip = 19)
horas_sol <- mutate(horas_sol, DATE = lubridate::ymd(DATE), SS = ifelse(SS == -9999, NA, SS / 10), month = month(DATE), year = year(DATE))
horas_sol <- rename(horas_sol, date = DATE, sunhours = SS)
horas_sol## # A tibble: 22,494 × 6
## SOUID date sunhours Q_SS month year
## <dbl> <date> <dbl> <dbl> <int> <int>
## 1 120414 1956-05-01 NA 9 5 1956
## 2 120414 1956-05-02 NA 9 5 1956
## 3 120414 1956-05-03 NA 9 5 1956
## 4 120414 1956-05-04 NA 9 5 1956
## 5 120414 1956-05-05 NA 9 5 1956
## 6 120414 1956-05-06 NA 9 5 1956
## 7 120414 1956-05-07 NA 9 5 1956
## 8 120414 1956-05-08 NA 9 5 1956
## 9 120414 1956-05-09 NA 9 5 1956
## 10 120414 1956-05-10 NA 9 5 1956
## # … with 22,484 more rows
# join daily temperature and sunshine tables on the date key
# (fixed lintr-flagged spacing: no space before "(" or inside the parens)
data_vigo <- left_join(ta_vigo, horas_sol, by = "date")
data_vigo## # A tibble: 13,149 × 7
## date TG SOUID sunhours Q_SS month year
## <date> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 1980-01-01 11.2 120414 0.3 0 1 1980
## 2 1980-01-02 9.5 120414 0.2 0 1 1980
## 3 1980-01-03 7.5 120414 1.2 0 1 1980
## 4 1980-01-04 10.3 120414 0.5 0 1 1980
## 5 1980-01-05 7.8 120414 3 0 1 1980
## 6 1980-01-06 7.3 120414 1.8 0 1 1980
## 7 1980-01-07 9.3 120414 0 0 1 1980
## 8 1980-01-08 9.8 120414 3.3 0 1 1980
## 9 1980-01-09 8.3 120414 0 0 1 1980
## 10 1980-01-10 6.5 120414 4.1 0 1 1980
## # … with 13,139 more rows
# NOTE(review): the joined table is written into `input/` (so the commented
# rio::import() below can re-read it), although results normally go to
# `output/` — confirm this is intended
rio::export(data_vigo, file = "input/data_vigo.csv")
# using separate()
# data_vigo <- rio::import("input/data_vigo.csv")
# split the "YYYY-MM-DD" date into numeric year / month / day columns
time_df <- select(data_vigo, date) %>%
  separate(date, c("yr", "mo", "dy"), sep = "-") %>%
  mutate(across(everything(), as.numeric))  # mutate_all() is superseded by across()
head(time_df)## # A tibble: 6 × 3
## yr mo dy
## <dbl> <dbl> <dbl>
## 1 1980 1 1
## 2 1980 1 2
## 3 1980 1 3
## 4 1980 1 4
## 5 1980 1 5
## 6 1980 1 6
# another example
# Split "Last, First" candidate names on ", ". The original call passed a
# stray positional `2`, which bound to `remove =` (truthy, so it behaved the
# same as the default TRUE); arguments are now named to make intent explicit.
contributions_split <- tidyr::separate(
  contributions, Candidate,
  into = c("LastName", "FirstName"), sep = ", "
) %>%
  select(-FirstName)
head(contributions_split)## LastName Pct_Local_Contributors
## Horrigan 0.035820896
## Neves-Grigg 0.011940299
## Sen 0.008955224
## Sousa 0.029850746
## Spicer 0.516417910
## Stefanini 0.337313433
# Split "First Middle Last" on spaces; two-word names leave LastName NA
# (see the <NA> row in the output below), which is repaired afterwards
results_split <- tidyr::separate(results, Candidate, c("FirstName", "MiddleName", "LastName"), " ")
tail(results_split)## # A tibble: 6 × 4
## FirstName MiddleName LastName Pct_Vote
## <chr> <chr> <chr> <dbl>
## 1 John A. Stefanini 0.292
## 2 Dhruba P. Sen 0.00926
## 3 Mark S. Tilden 0.0402
## 4 Yvonne M. Spicer 0.547
## 5 Benjaman A. Neves-Grigg, 0.0123
## 6 Priscila Sousa <NA> 0.0493
# Repair two-word names: their surname landed in MiddleName; also strip a
# trailing comma. Uses explicit assignment rather than magrittr's `%<>%`
# compound pipe, which the tidyverse style guide discourages.
results_split <- results_split %>%
  mutate(
    LastName = ifelse(is.na(LastName), MiddleName, LastName),
    LastName = str_replace(LastName, ",", "")
  ) %>%
  select(-FirstName, -MiddleName)
tail(results_split)## # A tibble: 6 × 2
## LastName Pct_Vote
## <chr> <dbl>
## 1 Stefanini 0.292
## 2 Sen 0.00926
## 3 Tilden 0.0402
## 4 Spicer 0.547
## 5 Neves-Grigg 0.0123
## 6 Sousa 0.0493
# using case_when(): map the month number to its (Spanish) season label
time_df <- time_df %>%
  mutate(season = case_when(
    mo %in% c(12, 1, 2) ~ "invierno",
    mo %in% 3:5 ~ "primavera",
    mo %in% 6:8 ~ "verano",
    mo %in% 9:11 ~ "otoño"
  ))
# preparing the data: lengthen the measurement columns into (Variable, Valor)
# pairs; pivot_longer() supersedes gather(). Note the TG:sunhours range also
# spans the SOUID column sitting between them (as in the original).
data_vigo <- pivot_longer(data_vigo, TG:sunhours, names_to = "Variable", values_to = "Valor")
# BUG FIX: the original drew indices with sample(nrow(data_vigo), 10000) —
# the row count of the FULL long table — and sliced the *filtered* subset
# with them; out-of-range indices were silently ignored, so far fewer than
# 10000 rows survived. slice_sample() samples from the subset itself.
data_vigo_subset <- filter(data_vigo, Variable == "TG") %>%
  slice_sample(n = 10000) %>%
  arrange(date)
head(data_vigo_subset)## # A tibble: 6 × 6
## date Q_SS month year Variable Valor
## <date> <dbl> <int> <int> <chr> <dbl>
## 1 1980-01-07 0 1 1980 TG 9.3
## 2 1980-01-11 0 1 1980 TG 7.1
## 3 1980-01-13 0 1 1980 TG 3.4
## 4 1980-01-17 0 1 1980 TG 4.5
## 5 1980-01-23 0 1 1980 TG 10.4
## 6 1980-01-27 0 1 1980 TG 12.7
# one entry per calendar day over the full study period
date_ts <- seq(lubridate::ymd("1980-01-01"), lubridate::ymd("2015-12-31"), by = "day")
# using complete(): insert the missing dates, filling other columns with NA
data_vigo_subset <- complete(data_vigo_subset, date = date_ts)
head(data_vigo_subset)## # A tibble: 6 × 6
## date Q_SS month year Variable Valor
## <date> <dbl> <int> <int> <chr> <dbl>
## 1 1980-01-01 NA NA NA <NA> NA
## 2 1980-01-02 NA NA NA <NA> NA
## 3 1980-01-03 NA NA NA <NA> NA
## 4 1980-01-04 NA NA NA <NA> NA
## 5 1980-01-05 NA NA NA <NA> NA
## 6 1980-01-06 NA NA NA <NA> NA
# using extract()
# example with separate()
tibble(
variable = c("a-b", "a-d", "b-c", "d-e")
) %>%
separate(
variable,
into = c("a", "b"),
sep = "-",
remove = FALSE
)## # A tibble: 4 × 3
## variable a b
## <chr> <chr> <chr>
## 1 a-b a b
## 2 a-d a d
## 3 b-c b c
## 4 d-e d e
# with extract
tibble(
variable = c("a-b", "a-d", "b-c", "d-e")
) %>%
extract(
col = variable,
into = c("a", "b"),
regex = "([a-z])-([a-z])",
remove = FALSE
)## # A tibble: 4 × 3
## variable a b
## <chr> <chr> <chr>
## 1 a-b a b
## 2 a-d a d
## 3 b-c b c
## 4 d-e d e
# without separator
tibble(
variable = c("x1", "x2", "y1", "y2")
) %>%
extract(
variable,
into = c("letter", "number"),
regex = "([xy])(\\d)",
remove = FALSE
)## # A tibble: 4 × 3
## variable letter number
## <chr> <chr> <chr>
## 1 x1 x 1
## 2 x2 x 2
## 3 y1 y 1
## 4 y2 y 2
tibble(
variable = c(
"David Jude Heyworth Law", "Elton Hercules John",
"Angelina Jolie Voight", "Jennifer Shrader Lawrence"
)
) %>%
extract(
variable,
into = c("short name", "remainder"),
regex = "(\\w+) .* (\\w+)",
remove = FALSE
)## # A tibble: 4 × 3
## variable `short name` remainder
## <chr> <chr> <chr>
## 1 David Jude Heyworth Law David Law
## 2 Elton Hercules John Elton John
## 3 Angelina Jolie Voight Angelina Voight
## 4 Jennifer Shrader Lawrence Jennifer Lawrence
# extracting from non-grouping parentheses
# (?: ... ) matches "->" with optional surrounding spaces but does not
# capture it, so only the letter and the digits become output columns
tibble(
  variable = c(
    "x -> 1",
    "y -> 2",
    "p-> 34"
  )
) %>%
  extract(
    variable,
    into = c("letter", "number"),
    remove = FALSE,
    regex = "([a-z])(?: ?-> ?)(\\d+)?"
  )## # A tibble: 3 × 3
## variable letter number
## <chr> <chr> <chr>
## 1 x -> 1 x 1
## 2 y -> 2 y 2
## 3 p-> 34 p 34
# another example
df <- tibble(
variable = c(
"x ->-> 1",
"y -> 2",
"p-> 34",
"f 4"
)
)
df %>%
extract(
variable,
into = c("letter", "number"),
remove = FALSE,
regex = "([a-z]) ?(?:->){0,} ?(\\d+)?"
)## # A tibble: 4 × 3
## variable letter number
## <chr> <chr> <chr>
## 1 x ->-> 1 x 1
## 2 y -> 2 y 2
## 3 p-> 34 p 34
## 4 f 4 f 4
# another one
df <- tibble(
variable = c(
"x ->aslkdfj 1", "y-> 2",
"p 34",
"8"
)
)
df %>%
extract(
variable,
into = c("letter", "number"),
remove = FALSE,
regex = "([a-z])? ?(?:->\\w*)? ?(\\d+)"
)## # A tibble: 4 × 3
## variable letter number
## <chr> <chr> <chr>
## 1 x ->aslkdfj 1 "x" 1
## 2 y-> 2 "y" 2
## 3 p 34 "p" 34
## 4 8 "" 8
# last example
tibble(
value = c(
"3.10 = AX",
"3.1345 = AX:?_40",
"3.8983 =:$15",
".873 = PFS:4"
)
) %>%
extract(
value,
into = c("v0", "v2", "v3", "v4"),
regex = "(\\d)?\\.(\\d+) ?= ?(?:(\\w+)?:?)?(?:[?_$]*)(\\d+)?",
remove = FALSE
)## # A tibble: 4 × 5
## value v0 v2 v3 v4
## <chr> <chr> <chr> <chr> <chr>
## 1 3.10 = AX "3" 10 "AX" ""
## 2 3.1345 = AX:?_40 "3" 1345 "AX" "40"
## 3 3.8983 =:$15 "3" 8983 "" "15"
## 4 .873 = PFS:4 "" 873 "PFS" "4"
# another example: a regex alternative to separate() — keep only the last
# whitespace-separated token of Candidate, dropping any trailing comma.
# The pattern is anchored and matches the whole string once, so
# str_replace() would behave identically here; "\\," escapes a comma
# unnecessarily but harmlessly.
results_regexp <- results %>%
  mutate(
    LastName = str_replace_all(Candidate, ".*\\s(.*?)\\,?$", "\\1")
  )
tail(results_regexp)## Candidate Pct_Vote LastName
## John A. Stefanini 0.291895856 Stefanini
## Dhruba P. Sen 0.009259259 Sen
## Mark S. Tilden 0.040245691 Tilden
## Yvonne M. Spicer 0.547029703 Spicer
## Benjaman A. Neves-Grigg, 0.012284562 Neves-Grigg
## Priscila Sousa 0.049321599 Sousa
Techniques for anonymizing and
pseudonymizing columns help avoid data breaches that
are potentially dangerous for those affected. To that end, we will find out
how to use the function fct_anon, how to replace names with
random names, how to mask values, how to group numeric variables, how to
remove house numbers from street names, and how to encode and decode
values. The difference between pseudonymization and anonymization is that
pseudonymization is reversible, while anonymization is not.
The EU defines pseudonymization as follows:
“The processing of personal data in such a manner that the personal data can no longer be attributed to a specific data subject without the use of additional information provided that such additional information is kept separately and is subject to technical and organisational measures to ensure that the personal data are not attributed to an identified or identifiable natural person.” (https://edps.europa.eu/system/files/2021-04/21-04-27_aepd-edps_anonymisation_en_5.pdf)
By this definition, pseudonymization is reversible and requires additional information to reverse the process.
Sometimes you want to make your data completely anonymous so that
other people can’t see sensitive information. A simple function to
anonymize such discrete data is fct_anon. The function
takes two arguments. The factor you want to anonymize, and the prefix
you put in front of the anonymized factor. The numbers are generated
randomly. So, each time you run this code, you will get a different set
of numbers.
Names are also sensitive data. To anonymize names, you can simply
replace them with random names. This can be done with the
randomNames function from the randomNames
package. You get a different set of names each time you run the
function. If we want to be more specific about how the names are
generated, we can provide some additional information to the
function.
Another common use case is the masking of values.
Masking is a technique that hides some characters of a string. Mostly by
“X”s. The .x stands for the piped variable (in this case
height). Then I provide a regular expression that searches for the last
character of the string (.$). This character should then be
replaced by an X. The regular expression ^.{10} indicates
that we are looking for the first 10 characters of the string. We
replace this pattern with 10 “X”s, specified by
strrep("X", 10). The function strrep is a
basic function of R, which simply repeats a series of characters.
Another common technique for anonymizing data is to divide it
into groups. With the function cut_width we can
create groups of arbitrary width from a numeric variable. The round
bracket means that a number is not included in the set. The square
bracket means that a number is included in the set. The function
cut_number creates a certain number of sets. Note, however,
that the width of each group varies.
Finally, we can anonymize each column by encrypting it. When we encrypt a column, we convert the values of a column into another form, which we call ciphertext. The ciphertext is not readable by humans, but it can be converted back to the original value. There are two forms of encryption. Symmetric encryption, where a single key is used to encrypt and decrypt a value, and asymmetric encryption, where two keys are used to encrypt and decrypt a value. A key is plaintext that translates between the two representations. Once you have the key in symmetric encryption, you can decrypt values. To decrypt values in asymmetric encryption, you need the public key and the private key. The public key is as it says public, so open to anyone. The private key is a key you should not share with anyone. Only when you have both, can you decrypt a value. Also, private key cannot be guessed from the public key. To add another level of security, the private key also sometimes has a passphrase (or password) to it.
We can encrypt this data with the package encryptr.
First we need to load the package and create the private and public keys
using the genkeys function. The function prompted us to
provide a passphrase for the private key. This passphrase and the
private key should not be shared with anyone! Once we have the
passphrase, we can encrypt our columns. To decrypt the column, we simply
use the decrypt function. You must provide the passphrase
to decrypt the column. Also, this works only if the R file is in the
same directory as the public and private keys.
levels(gss_cat$relig)## [1] "No answer" "Don't know"
## [3] "Inter-nondenominational" "Native american"
## [5] "Christian" "Orthodox-christian"
## [7] "Moslem/islam" "Other eastern"
## [9] "Hinduism" "Buddhism"
## [11] "Other" "None"
## [13] "Jewish" "Catholic"
## [15] "Protestant" "Not applicable"
# Anonymizing factor levels: forcats::fct_anon() replaces each level of
# `relig` with an arbitrary numbered level carrying the given prefix.
gss_cat %>%
mutate(
relig = fct_anon(relig, prefix = "religion_")
) %>%
glimpse()## Rows: 21,483
## Columns: 9
## $ year <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 20…
## $ marital <fct> Never married, Divorced, Widowed, Never married, Divorced, Mar…
## $ age <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52, 40…
## $ race <fct> White, White, White, White, White, White, White, White, White,…
## $ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicable, …
## $ partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,near…
## $ relig <fct> religion_14, religion_14, religion_14, religion_01, religion_0…
## $ denom <fct> "Southern baptist", "Baptist-dk which", "No denomination", "No…
## $ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3, 3…
# Anonymizing names: randomNames() draws one fake name per row
# (nrow(.) is the row count of the piped tibble), without replacement
# so no generated name repeats.
presidential %>%
mutate(
name = randomNames(nrow(.),
sample.with.replacement = FALSE
)
)## # A tibble: 11 × 4
## name start end party
## <chr> <date> <date> <chr>
## 1 Garcia, Deven 1953-01-20 1961-01-20 Republican
## 2 Young, Dejsha 1961-01-20 1963-11-22 Democratic
## 3 Robinson, Dante 1963-11-22 1969-01-20 Democratic
## 4 Cassetta, James 1969-01-20 1974-08-09 Republican
## 5 Kloster, Leeann 1974-08-09 1977-01-20 Republican
## 6 Vossler, Michaelann 1977-01-20 1981-01-20 Democratic
## 7 Rodriguez, Sonali 1981-01-20 1989-01-20 Republican
## 8 Jones, Jordan 1989-01-20 1993-01-20 Republican
## 9 el-Fares, Raihaan 1993-01-20 2001-01-20 Democratic
## 10 Craion, Isaac 2001-01-20 2009-01-20 Republican
## 11 Wilson, Lucero 2009-01-20 2017-01-20 Democratic
# Customizing the random names: restrict the ethnicity pool to codes
# 1, 2 and 4, order as "first last", and separate with a space instead
# of the default "last, first" comma format.
presidential %>%
mutate(
name = randomNames(nrow(.),
sample.with.replacement = FALSE,
ethnicity = c(1, 2, 4),
name.order = "first.last",
name.sep = " "
)
)## # A tibble: 11 × 4
## name start end party
## <chr> <date> <date> <chr>
## 1 John Ward 1953-01-20 1961-01-20 Republican
## 2 Joshua Xiong 1961-01-20 1963-11-22 Democratic
## 3 Alejandro Soliz 1963-11-22 1969-01-20 Democratic
## 4 Cambria Cdebaca 1969-01-20 1974-08-09 Republican
## 5 Hea Hildreth 1974-08-09 1977-01-20 Republican
## 6 Nancy Garcia 1977-01-20 1981-01-20 Democratic
## 7 Noe Chacon 1981-01-20 1989-01-20 Republican
## 8 Mackenzie Matagaono 1989-01-20 1993-01-20 Republican
## 9 Blanca Godoy 1993-01-20 2001-01-20 Democratic
## 10 Kaile Hammond 2001-01-20 2009-01-20 Republican
## 11 Kevin Blanco 2009-01-20 2017-01-20 Democratic
# Masking values: the regex ".$" matches the last character of the
# string, which str_replace() swaps for an "X"; map_chr() applies this
# per element and coerces the numeric heights to character.
starwars %>%
mutate(
height = map_chr(height, ~ str_replace(.x, ".$", "X"))
)## # A tibble: 87 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 17X 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 16X 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 9X 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 20X 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 15X 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 17X 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 16X 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 9X 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 18X 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 18X 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
# masking more than one character
# NOTE(review): the card numbers are stored as doubles, which is why
# the creditcards column prints in scientific notation below; storing
# them as character strings would avoid that.
ccards <- tibble(
creditcards = c(
36555224524299,
36350489667466,
36002887965170,
5447552069207504,
2221002654361034,
5127699386148536
)
)
# Masking the first 10 characters: "^.{10}" anchors at the start of the
# string and matches exactly 10 characters, which are replaced by the
# ten "X"s produced by strrep("X", 10).
ccards %>%
mutate(
creditcards_masked = map_chr(creditcards, ~ str_replace(.x, "^.{10}",
replacement = strrep("X", 10)
))
)## # A tibble: 6 × 2
## creditcards creditcards_masked
## <dbl> <chr>
## 1 3.66e13 XXXXXXXXXX4299
## 2 3.64e13 XXXXXXXXXX7466
## 3 3.60e13 XXXXXXXXXX5170
## 4 5.45e15 XXXXXXXXXX207504
## 5 2.22e15 XXXXXXXXXX361034
## 6 5.13e15 XXXXXXXXXX148536
# Masking the last 5 characters: "\\d{5}$" matches the final five
# digits, replaced by five "X"s.
# NOTE(review): the new column is named "creditcars" (sic); the typo is
# reproduced in the printed output below.
ccards %>%
mutate(
creditcars = map_chr(creditcards, ~ str_replace(.x, "\\d{5}$",
replacement = strrep("X", 5)
))
)## # A tibble: 6 × 2
## creditcards creditcars
## <dbl> <chr>
## 1 3.66e13 365552245XXXXX
## 2 3.64e13 363504896XXXXX
## 3 3.60e13 360028879XXXXX
## 4 5.45e15 54475520692XXXXX
## 5 2.22e15 22210026543XXXXX
## 6 5.13e15 51276993861XXXXX
# dividing values into groups
# Deriving an "age": current calendar year minus birth_year, keeping
# only name and age and dropping rows without a birth year. (Star Wars
# birth years are not CE years, hence the implausibly large ages.)
(age_starwars <- starwars %>%
mutate(age = as.integer(format(Sys.Date(), "%Y")) - birth_year) %>%
select(name, age) %>%
drop_na(age))## # A tibble: 43 × 2
## name age
## <chr> <dbl>
## 1 Luke Skywalker 2003
## 2 C-3PO 1910
## 3 R2-D2 1989
## 4 Darth Vader 1980.
## 5 Leia Organa 2003
## 6 Owen Lars 1970
## 7 Beru Whitesun lars 1975
## 8 Biggs Darklighter 1998
## 9 Obi-Wan Kenobi 1965
## 10 Anakin Skywalker 1980.
## # … with 33 more rows
# cut_width() bins age into intervals of fixed width 10; in the labels,
# "(" excludes the boundary value and "]" includes it.
age_starwars %>%
mutate(
age_groups = cut_width(age, 10)
)## # A tibble: 43 × 3
## name age age_groups
## <chr> <dbl> <fct>
## 1 Luke Skywalker 2003 (1995,2005]
## 2 C-3PO 1910 (1905,1915]
## 3 R2-D2 1989 (1985,1995]
## 4 Darth Vader 1980. (1975,1985]
## 5 Leia Organa 2003 (1995,2005]
## 6 Owen Lars 1970 (1965,1975]
## 7 Beru Whitesun lars 1975 (1965,1975]
## 8 Biggs Darklighter 1998 (1995,2005]
## 9 Obi-Wan Kenobi 1965 (1955,1965]
## 10 Anakin Skywalker 1980. (1975,1985]
## # … with 33 more rows
# cut_number() creates the requested number of groups (10), each with
# roughly the same number of observations — so the interval widths vary.
age_starwars %>%
mutate(
age_groups = cut_number(age, 10)
)## # A tibble: 43 × 3
## name age age_groups
## <chr> <dbl> <fct>
## 1 Luke Skywalker 2003 (2001,2014]
## 2 C-3PO 1910 [1126,1922]
## 3 R2-D2 1989 (1981,1991]
## 4 Darth Vader 1980. (1976,1981]
## 5 Leia Organa 2003 (2001,2014]
## 6 Owen Lars 1970 (1965,1970]
## 7 Beru Whitesun lars 1975 (1970,1976]
## 8 Biggs Darklighter 1998 (1991,2001]
## 9 Obi-Wan Kenobi 1965 (1965,1970]
## 10 Anakin Skywalker 1980. (1976,1981]
## # … with 33 more rows
# Rounding ages down to the millennium via integer division (%/%).
# NOTE(review): "millenium" is a misspelling of "millennium", but the
# column name is reproduced in the printed output below.
age_starwars %>%
mutate(
millenium = 1000 * (age %/% 1000)
)## # A tibble: 43 × 3
## name age millenium
## <chr> <dbl> <dbl>
## 1 Luke Skywalker 2003 2000
## 2 C-3PO 1910 1000
## 3 R2-D2 1989 1000
## 4 Darth Vader 1980. 1000
## 5 Leia Organa 2003 2000
## 6 Owen Lars 1970 1000
## 7 Beru Whitesun lars 1975 1000
## 8 Biggs Darklighter 1998 1000
## 9 Obi-Wan Kenobi 1965 1000
## 10 Anakin Skywalker 1980. 1000
## # … with 33 more rows
# Rounding ages down to the decade via integer division (%/%).
age_starwars %>%
mutate(
decade = 10 * (age %/% 10)
)## # A tibble: 43 × 3
## name age decade
## <chr> <dbl> <dbl>
## 1 Luke Skywalker 2003 2000
## 2 C-3PO 1910 1910
## 3 R2-D2 1989 1980
## 4 Darth Vader 1980. 1980
## 5 Leia Organa 2003 2000
## 6 Owen Lars 1970 1970
## 7 Beru Whitesun lars 1975 1970
## 8 Biggs Darklighter 1998 1990
## 9 Obi-Wan Kenobi 1965 1960
## 10 Anakin Skywalker 1980. 1980
## # … with 33 more rows
# removing house numbers from street names
street_names <- tibble(
street_name = c(
"Bromley Lanes 34",
"Woodsgate Avenue 12",
"Ardconnel Terrace 99",
"Gipsy Birches 45",
"Legate Close 8",
"Stevenson Oval 9",
"St Leonard's Boulevard 112",
"Copper Chare 435",
"Glastonbury Glebe 82",
"Southern Way 91"
)
)
# str_remove_all() deletes every digit ("\\d"); note in the output that
# the space preceding the removed house number is left behind.
street_names %>%
mutate(
street_names_no_number = str_remove_all(street_name, "\\d")
)## # A tibble: 10 × 2
## street_name street_names_no_number
## <chr> <chr>
## 1 Bromley Lanes 34 "Bromley Lanes "
## 2 Woodsgate Avenue 12 "Woodsgate Avenue "
## 3 Ardconnel Terrace 99 "Ardconnel Terrace "
## 4 Gipsy Birches 45 "Gipsy Birches "
## 5 Legate Close 8 "Legate Close "
## 6 Stevenson Oval 9 "Stevenson Oval "
## 7 St Leonard's Boulevard 112 "St Leonard's Boulevard "
## 8 Copper Chare 435 "Copper Chare "
## 9 Glastonbury Glebe 82 "Glastonbury Glebe "
## 10 Southern Way 91 "Southern Way "
# encrypting and decrypting columns (package: encryptr)
# NOTE(review): mixing the numeric 12345 with strings coerces the whole
# password column to character, which is what encrypt() operates on.
users <- tibble(
name = c("Alexander", "Marie", "John"),
password = c(12345, "8$43_45*", "becker23#")
)
# genkeys() generates a public and private key pair and prompts for a
# passphrase; keep the private key and passphrase secret.
# Passphrase: 456#7
# genkeys()
# encrypting a column
# users_encrypted <- users %>%
# encrypt(password)
# users_encrypted %>%
# glimpse()
# decrypting the column (requires the passphrase and both key files)
# users_encrypted %>% decrypt(password)Sometimes it makes sense to spread an observation over multiple rows (long format), and sometimes it makes more sense to spread a variable across multiple columns (wide format). Some analyses require long data, whereas others require wide data.
When going from a long format to a wide format, you
choose columns to group the observations by (in the gapminder
case: country and maybe also continent),
columns to take values names from (lifeExp,
pop and gdpPercap), and columns to create
variable names from (year). In data.table, the transformation
from long to wide is done using the dcast function.
In data.table, wide-to-long formatting is
done using melt.
# Converting the gapminder tibble to a data.table for dcast()/melt().
gm <- as.data.table(gapminder)
head(gm)## country continent year lifeExp pop gdpPercap
## 1: Afghanistan Asia 1952 28.801 8425333 779.4453
## 2: Afghanistan Asia 1957 30.332 9240934 820.8530
## 3: Afghanistan Asia 1962 31.997 10267083 853.1007
## 4: Afghanistan Asia 1967 34.020 11537966 836.1971
## 5: Afghanistan Asia 1972 36.088 13079460 739.9811
## 6: Afghanistan Asia 1977 38.438 14880372 786.1134
# from long to wide: dcast() groups rows by country + continent and
# spreads each value.var measure across one column per year
# (pop_1952, lifeExp_1952, gdpPercap_1952, ...).
gmw <- dcast(gm, country + continent ~ year, value.var = c("pop", "lifeExp", "gdpPercap"))
head(gmw)## country continent pop_1952 pop_1957 pop_1962 pop_1967 pop_1972 pop_1977
## 1: Afghanistan Asia 8425333 9240934 10267083 11537966 13079460 14880372
## 2: Albania Europe 1282697 1476505 1728137 1984060 2263554 2509048
## 3: Algeria Africa 9279525 10270856 11000948 12760499 14760787 17152804
## 4: Angola Africa 4232095 4561361 4826015 5247469 5894858 6162675
## 5: Argentina Americas 17876956 19610538 21283783 22934225 24779799 26983828
## 6: Australia Oceania 8691212 9712569 10794968 11872264 13177000 14074100
## pop_1982 pop_1987 pop_1992 pop_1997 pop_2002 pop_2007 lifeExp_1952
## 1: 12881816 13867957 16317921 22227415 25268405 31889923 28.801
## 2: 2780097 3075321 3326498 3428038 3508512 3600523 55.230
## 3: 20033753 23254956 26298373 29072015 31287142 33333216 43.077
## 4: 7016384 7874230 8735988 9875024 10866106 12420476 30.015
## 5: 29341374 31620918 33958947 36203463 38331121 40301927 62.485
## 6: 15184200 16257249 17481977 18565243 19546792 20434176 69.120
## lifeExp_1957 lifeExp_1962 lifeExp_1967 lifeExp_1972 lifeExp_1977
## 1: 30.332 31.997 34.020 36.088 38.438
## 2: 59.280 64.820 66.220 67.690 68.930
## 3: 45.685 48.303 51.407 54.518 58.014
## 4: 31.999 34.000 35.985 37.928 39.483
## 5: 64.399 65.142 65.634 67.065 68.481
## 6: 70.330 70.930 71.100 71.930 73.490
## lifeExp_1982 lifeExp_1987 lifeExp_1992 lifeExp_1997 lifeExp_2002
## 1: 39.854 40.822 41.674 41.763 42.129
## 2: 70.420 72.000 71.581 72.950 75.651
## 3: 61.368 65.799 67.744 69.152 70.994
## 4: 39.942 39.906 40.647 40.963 41.003
## 5: 69.942 70.774 71.868 73.275 74.340
## 6: 74.740 76.320 77.560 78.830 80.370
## lifeExp_2007 gdpPercap_1952 gdpPercap_1957 gdpPercap_1962 gdpPercap_1967
## 1: 43.828 779.4453 820.853 853.1007 836.1971
## 2: 76.423 1601.0561 1942.284 2312.8890 2760.1969
## 3: 72.301 2449.0082 3013.976 2550.8169 3246.9918
## 4: 42.731 3520.6103 3827.940 4269.2767 5522.7764
## 5: 75.320 5911.3151 6856.856 7133.1660 8052.9530
## 6: 81.235 10039.5956 10949.650 12217.2269 14526.1246
## gdpPercap_1972 gdpPercap_1977 gdpPercap_1982 gdpPercap_1987 gdpPercap_1992
## 1: 739.9811 786.1134 978.0114 852.3959 649.3414
## 2: 3313.4222 3533.0039 3630.8807 3738.9327 2497.4379
## 3: 4182.6638 4910.4168 5745.1602 5681.3585 5023.2166
## 4: 5473.2880 3008.6474 2756.9537 2430.2083 2627.8457
## 5: 9443.0385 10079.0267 8997.8974 9139.6714 9308.4187
## 6: 16788.6295 18334.1975 19477.0093 21888.8890 23424.7668
## gdpPercap_1997 gdpPercap_2002 gdpPercap_2007
## 1: 635.3414 726.7341 974.5803
## 2: 3193.0546 4604.2117 5937.0295
## 3: 4797.2951 5288.0404 6223.3675
## 4: 2277.1409 2773.2873 4797.2313
## 5: 10967.2820 8797.6407 12779.3796
## 6: 26997.9366 30687.7547 34435.3674
# wide-to-long (not the best approach!)
# NOTE(review): measure.vars = 2:37 includes column 2 (`continent`) as a
# measure and misses the final measure column — that is why `continent`
# appears under `variable` below and NA rows show up after recasting.
gm <- melt(gmw, id.vars = c("country", "continent"), measure.vars = 2:37)
head(gm)## country continent variable value
## 1: Afghanistan Asia continent Asia
## 2: Albania Europe continent Europe
## 3: Algeria Africa continent Africa
## 4: Angola Africa continent Africa
## 5: Argentina Americas continent Americas
## 6: Australia Oceania continent Oceania
# splitting columns and casting: tstrsplit() splits "pop_1952"-style
# names on "_" into the measure name ("variable") and "year"; dcast()
# then spreads the measures back into separate columns.
gm[, c("variable", "year") := tstrsplit(variable, "_", fixed = TRUE)]
gm <- dcast(gm, country + year ~ variable, value.var = c("value"))
head(gm)## country year continent gdpPercap lifeExp pop
## 1: Afghanistan <NA> Asia <NA> <NA> <NA>
## 2: Afghanistan 1952 <NA> 779.4453145 28.801 8425333
## 3: Afghanistan 1957 <NA> 820.8530296 30.332 9240934
## 4: Afghanistan 1962 <NA> 853.10071 31.997 10267083
## 5: Afghanistan 1967 <NA> 836.1971382 34.02 11537966
## 6: Afghanistan 1972 <NA> 739.9811058 36.088 13079460
# merging columns
aq <- as.data.table(airquality)
head(aq)## Ozone Solar.R Wind Temp Month Day
## 1: 41 190 7.4 67 5 1
## 2: 36 118 8.0 72 5 2
## 3: 12 149 12.6 74 5 3
## 4: 18 313 11.5 62 5 4
## 5: NA NA 14.3 56 5 5
## 6: 28 NA 14.9 66 5 6
# creating a new column Date and merging Month and Day into it (Date
# format); 1973 is hard-coded — presumably the airquality measurement
# year, per the dataset's documentation.
aq[, Date := as.Date(paste(1973, aq$Month, aq$Day, sep = "-"))]
head(aq)## Ozone Solar.R Wind Temp Month Day Date
## 1: 41 190 7.4 67 5 1 1973-05-01
## 2: 36 118 8.0 72 5 2 1973-05-02
## 3: 12 149 12.6 74 5 3 1973-05-03
## 4: 18 313 11.5 62 5 4 1973-05-04
## 5: NA NA 14.3 56 5 5 1973-05-05
## 6: 28 NA 14.9 66 5 6 1973-05-06
It is common that data is spread over multiple tables. Consequently, it is important to be able to merge data from different tables. The simplest types of merges are binds, which can be used when you have two tables where either the rows or the columns match each other exactly.
An operation that combines columns from two tables is called a join. There are two main types of joins: inner joins and outer joins.
rev_data and weather_data
tables using DATE as the key, it won't contain data for the
days that are missing from either the revenue table or the weather
table. In contrast, outer joins create a table retaining rows, even if there is no match in the other table. There are three types of outer joins:
rev_data and
weather_data. Semijoins and antijoins are similar to joins, but work on observations rather than variables. That is, they are used for filtering one table using data from another table:
The same thing can be achieved using the filtering techniques, but semijoins and antijoins are simpler to use when the filtering relies on conditions from another table.
# preparing the data: read the semicolon-separated revenue and weather
# files, convert both to data.table, and parse the DATE key as Date.
rev_data <- read.csv("input/sales-rev.csv", sep = ";")
weather_data <- read.csv("input/sales-weather.csv", sep = ";")
rev_data <- as.data.table(rev_data)
rev_data$DATE <- as.Date(rev_data$DATE)
weather_data <- as.data.table(weather_data)
weather_data$DATE <- as.Date(weather_data$DATE)
# monthly slices via data.table's %between% (inclusive range filter)
rev_jan <- rev_data[DATE %between% c("2020-01-01", "2020-01-31"), ]
rev_feb <- rev_data[DATE %between% c("2020-02-01", "2020-02-29"), ]
weather_jan <- weather_data[DATE %between% c("2020-01-01", "2020-01-31"), ]
str(rev_jan) ## Classes 'data.table' and 'data.frame': 31 obs. of 2 variables:
## $ DATE : Date, format: "2020-01-01" "2020-01-02" ...
## $ REVENUE: int 7637 9276 11170 11863 10880 6702 8652 8346 6543 8115 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(rev_feb) ## Classes 'data.table' and 'data.frame': 29 obs. of 2 variables:
## $ DATE : Date, format: "2020-02-01" "2020-02-02" ...
## $ REVENUE: int 10192 13904 11208 8578 6638 7093 8187 10099 8160 3797 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(weather_jan)## Classes 'data.table' and 'data.frame': 31 obs. of 5 variables:
## $ DATE : Date, format: "2020-01-01" "2020-01-02" ...
## $ SUN_HOURS : num 0 0.372 0.264 3.549 2.513 ...
## $ PRECIPITATION: num 0 0 0 0 2.4 0.2 0 0 0.6 0 ...
## $ SNOW_DEPTH : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TEMPERATURE : num 0 5.8 5.4 2.4 0.1 4.8 5.5 6.5 3.9 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# with Base-R
# cbind() joins columns of datasets that have the same rows; note the
# duplicated DATE column in the output — cbind does no key matching.
cbind(rev_jan, weather_jan)## DATE REVENUE DATE SUN_HOURS PRECIPITATION SNOW_DEPTH
## 1: 2020-01-01 7637 2020-01-01 0.00000000 0.0 0
## 2: 2020-01-02 9276 2020-01-02 0.37250000 0.0 0
## 3: 2020-01-03 11170 2020-01-03 0.26361111 0.0 0
## 4: 2020-01-04 11863 2020-01-04 3.54861111 0.0 0
## 5: 2020-01-05 10880 2020-01-05 2.51250000 2.4 0
## 6: 2020-01-06 6702 2020-01-06 0.09333333 0.2 0
## 7: 2020-01-07 8652 2020-01-07 0.00000000 0.0 0
## 8: 2020-01-08 8346 2020-01-08 0.16166667 0.0 0
## 9: 2020-01-09 6543 2020-01-09 0.31944444 0.6 0
## 10: 2020-01-10 8115 2020-01-10 1.58694444 0.0 0
## 11: 2020-01-11 7728 2020-01-11 0.00000000 0.2 0
## 12: 2020-01-12 10649 2020-01-12 0.44194444 0.0 0
## 13: 2020-01-13 6787 2020-01-13 2.42083333 0.0 0
## 14: 2020-01-14 4555 2020-01-14 0.00000000 7.4 0
## 15: 2020-01-15 5885 2020-01-15 0.00000000 0.0 0
## 16: 2020-01-16 10127 2020-01-16 0.85583333 0.0 0
## 17: 2020-01-17 8893 2020-01-17 0.00000000 0.0 0
## 18: 2020-01-18 12520 2020-01-18 0.05500000 0.0 0
## 19: 2020-01-19 11860 2020-01-19 6.18555556 0.0 0
## 20: 2020-01-20 8515 2020-01-20 0.77305556 0.0 0
## 21: 2020-01-21 8129 2020-01-21 2.09777778 0.0 0
## 22: 2020-01-22 10405 2020-01-22 3.55250000 0.0 0
## 23: 2020-01-23 6672 2020-01-23 2.02694444 0.0 0
## 24: 2020-01-24 12300 2020-01-24 4.97361111 0.0 0
## 25: 2020-01-25 10651 2020-01-25 4.45861111 0.0 0
## 26: 2020-01-26 11882 2020-01-26 0.00000000 0.0 0
## 27: 2020-01-27 9397 2020-01-27 0.00000000 0.6 0
## 28: 2020-01-28 5174 2020-01-28 0.00000000 10.9 0
## 29: 2020-01-29 4436 2020-01-29 0.00000000 1.5 0
## 30: 2020-01-30 6202 2020-01-30 0.00000000 0.2 0
## 31: 2020-01-31 9949 2020-01-31 0.00000000 2.0 0
## DATE REVENUE DATE SUN_HOURS PRECIPITATION SNOW_DEPTH
## TEMPERATURE
## 1: 0.0
## 2: 5.8
## 3: 5.4
## 4: 2.4
## 5: 0.1
## 6: 4.8
## 7: 5.5
## 8: 6.5
## 9: 3.9
## 10: 1.0
## 11: 1.2
## 12: 5.9
## 13: 2.6
## 14: 3.8
## 15: 7.8
## 16: 6.1
## 17: 4.8
## 18: 4.8
## 19: 3.1
## 20: 3.8
## 21: 6.8
## 22: 3.2
## 23: 1.5
## 24: 5.1
## 25: 2.1
## 26: 5.8
## 27: 4.8
## 28: 3.4
## 29: 3.3
## 30: 2.4
## 31: 3.0
## TEMPERATURE
# rbind() stacks rows of datasets that have the same columns:
rbind(rev_jan, rev_feb)## DATE REVENUE
## 1: 2020-01-01 7637
## 2: 2020-01-02 9276
## 3: 2020-01-03 11170
## 4: 2020-01-04 11863
## 5: 2020-01-05 10880
## 6: 2020-01-06 6702
## 7: 2020-01-07 8652
## 8: 2020-01-08 8346
## 9: 2020-01-09 6543
## 10: 2020-01-10 8115
## 11: 2020-01-11 7728
## 12: 2020-01-12 10649
## 13: 2020-01-13 6787
## 14: 2020-01-14 4555
## 15: 2020-01-15 5885
## 16: 2020-01-16 10127
## 17: 2020-01-17 8893
## 18: 2020-01-18 12520
## 19: 2020-01-19 11860
## 20: 2020-01-20 8515
## 21: 2020-01-21 8129
## 22: 2020-01-22 10405
## 23: 2020-01-23 6672
## 24: 2020-01-24 12300
## 25: 2020-01-25 10651
## 26: 2020-01-26 11882
## 27: 2020-01-27 9397
## 28: 2020-01-28 5174
## 29: 2020-01-29 4436
## 30: 2020-01-30 6202
## 31: 2020-01-31 9949
## 32: 2020-02-01 10192
## 33: 2020-02-02 13904
## 34: 2020-02-03 11208
## 35: 2020-02-04 8578
## 36: 2020-02-05 6638
## 37: 2020-02-06 7093
## 38: 2020-02-07 8187
## 39: 2020-02-08 10099
## 40: 2020-02-09 8160
## 41: 2020-02-10 3797
## 42: 2020-02-11 8734
## 43: 2020-02-12 4355
## 44: 2020-02-13 8452
## 45: 2020-02-14 7367
## 46: 2020-02-15 9339
## 47: 2020-02-16 5427
## 48: 2020-02-17 7022
## 49: 2020-02-18 8417
## 50: 2020-02-19 8760
## 51: 2020-02-20 4936
## 52: 2020-02-21 11273
## 53: 2020-02-22 7046
## 54: 2020-02-23 10074
## 55: 2020-02-24 9114
## 56: 2020-02-25 4493
## 57: 2020-02-26 4992
## 58: 2020-02-27 3789
## 59: 2020-02-28 7480
## 60: 2020-02-29 8358
## DATE REVENUE
# with dplyr
# Join columns of datasets that have the same rows:
# bind_cols(rev_jan, weather_jan)
# Join rows of datasets that have the same columns:
# bind_rows(rev_jan, rev_feb)
# merging tables using keys
# data.table
# inner join: merge() by DATE keeps only the dates present in BOTH tables
merge(rev_data, weather_data, by = "DATE")## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
## 1: 2020-01-01 7637 0.00000000 0.0 0.00 0.0
## 2: 2020-01-02 9276 0.37250000 0.0 0.00 5.8
## 3: 2020-01-03 11170 0.26361111 0.0 0.00 5.4
## 4: 2020-01-04 11863 3.54861111 0.0 0.00 2.4
## 5: 2020-01-05 10880 2.51250000 2.4 0.00 0.1
## 6: 2020-01-06 6702 0.09333333 0.2 0.00 4.8
## 7: 2020-01-07 8652 0.00000000 0.0 0.00 5.5
## 8: 2020-01-08 8346 0.16166667 0.0 0.00 6.5
## 9: 2020-01-09 6543 0.31944444 0.6 0.00 3.9
## 10: 2020-01-10 8115 1.58694444 0.0 0.00 1.0
## 11: 2020-01-11 7728 0.00000000 0.2 0.00 1.2
## 12: 2020-01-12 10649 0.44194444 0.0 0.00 5.9
## 13: 2020-01-13 6787 2.42083333 0.0 0.00 2.6
## 14: 2020-01-14 4555 0.00000000 7.4 0.00 3.8
## 15: 2020-01-15 5885 0.00000000 0.0 0.00 7.8
## 16: 2020-01-16 10127 0.85583333 0.0 0.00 6.1
## 17: 2020-01-17 8893 0.00000000 0.0 0.00 4.8
## 18: 2020-01-18 12520 0.05500000 0.0 0.00 4.8
## 19: 2020-01-19 11860 6.18555556 0.0 0.00 3.1
## 20: 2020-01-20 8515 0.77305556 0.0 0.00 3.8
## 21: 2020-01-21 8129 2.09777778 0.0 0.00 6.8
## 22: 2020-01-22 10405 3.55250000 0.0 0.00 3.2
## 23: 2020-01-23 6672 2.02694444 0.0 0.00 1.5
## 24: 2020-01-24 12300 4.97361111 0.0 0.00 5.1
## 25: 2020-01-25 10651 4.45861111 0.0 0.00 2.1
## 26: 2020-01-26 11882 0.00000000 0.0 0.00 5.8
## 27: 2020-01-27 9397 0.00000000 0.6 0.00 4.8
## 28: 2020-01-28 5174 0.00000000 10.9 0.00 3.4
## 29: 2020-01-29 4436 0.00000000 1.5 0.00 3.3
## 30: 2020-01-30 6202 0.00000000 0.2 0.00 2.4
## 31: 2020-01-31 9949 0.00000000 2.0 0.00 3.0
## 32: 2020-02-01 10192 1.50750000 1.5 0.00 6.1
## 33: 2020-02-02 13904 6.70888889 0.6 0.00 3.8
## 34: 2020-02-03 11208 6.85888889 0.0 0.00 0.5
## 35: 2020-02-04 8578 0.09138889 0.2 0.00 -1.3
## 36: 2020-02-07 8187 1.15611111 0.0 0.00 0.1
## 37: 2020-02-08 10099 0.00000000 0.8 0.00 2.4
## 38: 2020-02-09 8160 1.14805556 6.4 0.00 4.5
## 39: 2020-02-10 3797 0.23944444 0.7 0.00 5.7
## 40: 2020-02-11 8734 0.69166667 0.0 0.00 4.0
## 41: 2020-02-12 4355 1.69333333 1.5 0.00 3.2
## 42: 2020-02-13 8452 8.53333333 0.0 0.00 1.0
## 43: 2020-02-14 7367 6.66250000 0.0 0.00 0.8
## 44: 2020-02-15 9339 0.01416667 4.6 0.00 3.8
## 45: 2020-02-16 5427 0.00000000 1.5 0.00 7.1
## 46: 2020-02-17 7022 1.55083333 0.1 0.00 6.6
## 47: 2020-02-18 8417 2.58805556 0.0 0.00 6.6
## 48: 2020-02-19 8760 7.62638889 0.0 0.00 4.5
## 49: 2020-02-20 4936 3.16527778 5.0 0.00 2.8
## 50: 2020-02-21 11273 7.49055556 0.0 0.00 5.7
## 51: 2020-02-22 7046 0.20055556 0.8 0.00 6.5
## 52: 2020-02-23 10074 6.78666667 0.0 0.00 5.2
## 53: 2020-02-24 9114 8.94694444 0.0 0.00 2.3
## 54: 2020-02-25 4493 2.53583333 1.1 0.00 1.4
## 55: 2020-02-26 4992 0.51944444 0.5 0.00 0.2
## 56: 2020-02-27 3789 0.00000000 0.8 0.01 -1.3
## 57: 2020-02-28 7480 6.55861111 0.2 0.05 -1.3
## 58: 2020-02-29 8358 2.25750000 4.5 0.05 1.1
## 59: 2020-03-02 6489 6.01888889 4.0 0.00 4.4
## 60: 2020-03-03 3586 0.00000000 7.2 0.00 3.7
## 61: 2020-03-04 2570 0.00000000 6.4 0.00 2.1
## 62: 2020-03-05 5059 0.04666667 0.1 0.00 2.1
## 63: 2020-03-06 9087 9.34805556 0.0 0.00 2.0
## 64: 2020-03-07 11570 2.85055556 0.0 0.00 1.9
## 65: 2020-03-08 10476 2.89916667 0.0 0.00 4.5
## 66: 2020-03-09 8977 3.60500000 2.0 0.00 6.8
## 67: 2020-03-11 5837 0.13555556 0.0 0.00 5.6
## 68: 2020-03-12 3559 0.66138889 15.2 0.00 4.9
## 69: 2020-03-13 7489 5.24972222 0.1 0.00 2.1
## 70: 2020-03-14 8138 11.19111111 0.0 0.00 -0.9
## 71: 2020-03-15 8876 5.43888889 2.1 0.00 2.0
## 72: 2020-03-16 4800 6.09472222 2.9 0.00 4.1
## 73: 2020-03-17 3836 0.59083333 0.0 0.00 5.7
## 74: 2020-03-18 2750 1.32916667 5.8 0.00 6.3
## 75: 2020-03-19 7715 11.57500000 0.1 0.00 5.5
## 76: 2020-03-20 4364 3.21694444 0.1 0.00 2.1
## 77: 2020-03-21 9393 10.37333333 0.0 0.00 0.0
## 78: 2020-03-22 7830 11.13527778 0.0 0.00 0.1
## 79: 2020-03-23 4858 6.43416667 0.0 0.00 2.8
## 80: 2020-03-24 5140 5.35000000 0.0 0.00 4.2
## 81: 2020-03-25 7043 6.95083333 0.0 0.00 7.0
## 82: 2020-03-26 8662 11.44222222 0.0 0.00 6.5
## 83: 2020-03-27 9196 9.69944444 0.0 0.00 6.2
## 84: 2020-03-28 8267 3.41638889 0.0 0.00 5.3
## 85: 2020-03-30 4249 7.36500000 0.1 0.00 0.1
## 86: 2020-03-31 8291 10.23666667 0.0 0.00 3.4
## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# Or, with a keyed data.table join:
# setkey(rev_data, DATE)
# rev_data[weather_data, nomatch = 0]
# # dplyr
# rev_data %>%
# inner_join( weather_data, by = "DATE")
# outer join (left): all.x = TRUE keeps every row of rev_data; the
# weather columns are NA where no matching DATE exists.
# data.table
merge(rev_data, weather_data, all.x = TRUE, by = "DATE") ## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
## 1: 2020-01-01 7637 0.00000000 0.0 0.00 0.0
## 2: 2020-01-02 9276 0.37250000 0.0 0.00 5.8
## 3: 2020-01-03 11170 0.26361111 0.0 0.00 5.4
## 4: 2020-01-04 11863 3.54861111 0.0 0.00 2.4
## 5: 2020-01-05 10880 2.51250000 2.4 0.00 0.1
## 6: 2020-01-06 6702 0.09333333 0.2 0.00 4.8
## 7: 2020-01-07 8652 0.00000000 0.0 0.00 5.5
## 8: 2020-01-08 8346 0.16166667 0.0 0.00 6.5
## 9: 2020-01-09 6543 0.31944444 0.6 0.00 3.9
## 10: 2020-01-10 8115 1.58694444 0.0 0.00 1.0
## 11: 2020-01-11 7728 0.00000000 0.2 0.00 1.2
## 12: 2020-01-12 10649 0.44194444 0.0 0.00 5.9
## 13: 2020-01-13 6787 2.42083333 0.0 0.00 2.6
## 14: 2020-01-14 4555 0.00000000 7.4 0.00 3.8
## 15: 2020-01-15 5885 0.00000000 0.0 0.00 7.8
## 16: 2020-01-16 10127 0.85583333 0.0 0.00 6.1
## 17: 2020-01-17 8893 0.00000000 0.0 0.00 4.8
## 18: 2020-01-18 12520 0.05500000 0.0 0.00 4.8
## 19: 2020-01-19 11860 6.18555556 0.0 0.00 3.1
## 20: 2020-01-20 8515 0.77305556 0.0 0.00 3.8
## 21: 2020-01-21 8129 2.09777778 0.0 0.00 6.8
## 22: 2020-01-22 10405 3.55250000 0.0 0.00 3.2
## 23: 2020-01-23 6672 2.02694444 0.0 0.00 1.5
## 24: 2020-01-24 12300 4.97361111 0.0 0.00 5.1
## 25: 2020-01-25 10651 4.45861111 0.0 0.00 2.1
## 26: 2020-01-26 11882 0.00000000 0.0 0.00 5.8
## 27: 2020-01-27 9397 0.00000000 0.6 0.00 4.8
## 28: 2020-01-28 5174 0.00000000 10.9 0.00 3.4
## 29: 2020-01-29 4436 0.00000000 1.5 0.00 3.3
## 30: 2020-01-30 6202 0.00000000 0.2 0.00 2.4
## 31: 2020-01-31 9949 0.00000000 2.0 0.00 3.0
## 32: 2020-02-01 10192 1.50750000 1.5 0.00 6.1
## 33: 2020-02-02 13904 6.70888889 0.6 0.00 3.8
## 34: 2020-02-03 11208 6.85888889 0.0 0.00 0.5
## 35: 2020-02-04 8578 0.09138889 0.2 0.00 -1.3
## 36: 2020-02-05 6638 NA NA NA NA
## 37: 2020-02-06 7093 NA NA NA NA
## 38: 2020-02-07 8187 1.15611111 0.0 0.00 0.1
## 39: 2020-02-08 10099 0.00000000 0.8 0.00 2.4
## 40: 2020-02-09 8160 1.14805556 6.4 0.00 4.5
## 41: 2020-02-10 3797 0.23944444 0.7 0.00 5.7
## 42: 2020-02-11 8734 0.69166667 0.0 0.00 4.0
## 43: 2020-02-12 4355 1.69333333 1.5 0.00 3.2
## 44: 2020-02-13 8452 8.53333333 0.0 0.00 1.0
## 45: 2020-02-14 7367 6.66250000 0.0 0.00 0.8
## 46: 2020-02-15 9339 0.01416667 4.6 0.00 3.8
## 47: 2020-02-16 5427 0.00000000 1.5 0.00 7.1
## 48: 2020-02-17 7022 1.55083333 0.1 0.00 6.6
## 49: 2020-02-18 8417 2.58805556 0.0 0.00 6.6
## 50: 2020-02-19 8760 7.62638889 0.0 0.00 4.5
## 51: 2020-02-20 4936 3.16527778 5.0 0.00 2.8
## 52: 2020-02-21 11273 7.49055556 0.0 0.00 5.7
## 53: 2020-02-22 7046 0.20055556 0.8 0.00 6.5
## 54: 2020-02-23 10074 6.78666667 0.0 0.00 5.2
## 55: 2020-02-24 9114 8.94694444 0.0 0.00 2.3
## 56: 2020-02-25 4493 2.53583333 1.1 0.00 1.4
## 57: 2020-02-26 4992 0.51944444 0.5 0.00 0.2
## 58: 2020-02-27 3789 0.00000000 0.8 0.01 -1.3
## 59: 2020-02-28 7480 6.55861111 0.2 0.05 -1.3
## 60: 2020-02-29 8358 2.25750000 4.5 0.05 1.1
## 61: 2020-03-02 6489 6.01888889 4.0 0.00 4.4
## 62: 2020-03-03 3586 0.00000000 7.2 0.00 3.7
## 63: 2020-03-04 2570 0.00000000 6.4 0.00 2.1
## 64: 2020-03-05 5059 0.04666667 0.1 0.00 2.1
## 65: 2020-03-06 9087 9.34805556 0.0 0.00 2.0
## 66: 2020-03-07 11570 2.85055556 0.0 0.00 1.9
## 67: 2020-03-08 10476 2.89916667 0.0 0.00 4.5
## 68: 2020-03-09 8977 3.60500000 2.0 0.00 6.8
## 69: 2020-03-10 4509 NA NA NA NA
## 70: 2020-03-11 5837 0.13555556 0.0 0.00 5.6
## 71: 2020-03-12 3559 0.66138889 15.2 0.00 4.9
## 72: 2020-03-13 7489 5.24972222 0.1 0.00 2.1
## 73: 2020-03-14 8138 11.19111111 0.0 0.00 -0.9
## 74: 2020-03-15 8876 5.43888889 2.1 0.00 2.0
## 75: 2020-03-16 4800 6.09472222 2.9 0.00 4.1
## 76: 2020-03-17 3836 0.59083333 0.0 0.00 5.7
## 77: 2020-03-18 2750 1.32916667 5.8 0.00 6.3
## 78: 2020-03-19 7715 11.57500000 0.1 0.00 5.5
## 79: 2020-03-20 4364 3.21694444 0.1 0.00 2.1
## 80: 2020-03-21 9393 10.37333333 0.0 0.00 0.0
## 81: 2020-03-22 7830 11.13527778 0.0 0.00 0.1
## 82: 2020-03-23 4858 6.43416667 0.0 0.00 2.8
## 83: 2020-03-24 5140 5.35000000 0.0 0.00 4.2
## 84: 2020-03-25 7043 6.95083333 0.0 0.00 7.0
## 85: 2020-03-26 8662 11.44222222 0.0 0.00 6.5
## 86: 2020-03-27 9196 9.69944444 0.0 0.00 6.2
## 87: 2020-03-28 8267 3.41638889 0.0 0.00 5.3
## 88: 2020-03-29 8237 NA NA NA NA
## 89: 2020-03-30 4249 7.36500000 0.1 0.00 0.1
## 90: 2020-03-31 8291 10.23666667 0.0 0.00 3.4
## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# Or, with a keyed data.table join:
# setkey(weather_data, DATE)
# weather_data[rev_data]
# dplyr
# rev_data %>%
# left_join( weather_data, by = "DATE")
# outer join (right): all.y = TRUE keeps every row of weather_data;
# REVENUE is NA where rev_data has no matching DATE.
# data.table
merge(rev_data, weather_data, all.y = TRUE, by = "DATE") ## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
## 1: 2020-01-01 7637 0.00000000 0.0 0.00 0.0
## 2: 2020-01-02 9276 0.37250000 0.0 0.00 5.8
## 3: 2020-01-03 11170 0.26361111 0.0 0.00 5.4
## 4: 2020-01-04 11863 3.54861111 0.0 0.00 2.4
## 5: 2020-01-05 10880 2.51250000 2.4 0.00 0.1
## 6: 2020-01-06 6702 0.09333333 0.2 0.00 4.8
## 7: 2020-01-07 8652 0.00000000 0.0 0.00 5.5
## 8: 2020-01-08 8346 0.16166667 0.0 0.00 6.5
## 9: 2020-01-09 6543 0.31944444 0.6 0.00 3.9
## 10: 2020-01-10 8115 1.58694444 0.0 0.00 1.0
## 11: 2020-01-11 7728 0.00000000 0.2 0.00 1.2
## 12: 2020-01-12 10649 0.44194444 0.0 0.00 5.9
## 13: 2020-01-13 6787 2.42083333 0.0 0.00 2.6
## 14: 2020-01-14 4555 0.00000000 7.4 0.00 3.8
## 15: 2020-01-15 5885 0.00000000 0.0 0.00 7.8
## 16: 2020-01-16 10127 0.85583333 0.0 0.00 6.1
## 17: 2020-01-17 8893 0.00000000 0.0 0.00 4.8
## 18: 2020-01-18 12520 0.05500000 0.0 0.00 4.8
## 19: 2020-01-19 11860 6.18555556 0.0 0.00 3.1
## 20: 2020-01-20 8515 0.77305556 0.0 0.00 3.8
## 21: 2020-01-21 8129 2.09777778 0.0 0.00 6.8
## 22: 2020-01-22 10405 3.55250000 0.0 0.00 3.2
## 23: 2020-01-23 6672 2.02694444 0.0 0.00 1.5
## 24: 2020-01-24 12300 4.97361111 0.0 0.00 5.1
## 25: 2020-01-25 10651 4.45861111 0.0 0.00 2.1
## 26: 2020-01-26 11882 0.00000000 0.0 0.00 5.8
## 27: 2020-01-27 9397 0.00000000 0.6 0.00 4.8
## 28: 2020-01-28 5174 0.00000000 10.9 0.00 3.4
## 29: 2020-01-29 4436 0.00000000 1.5 0.00 3.3
## 30: 2020-01-30 6202 0.00000000 0.2 0.00 2.4
## 31: 2020-01-31 9949 0.00000000 2.0 0.00 3.0
## 32: 2020-02-01 10192 1.50750000 1.5 0.00 6.1
## 33: 2020-02-02 13904 6.70888889 0.6 0.00 3.8
## 34: 2020-02-03 11208 6.85888889 0.0 0.00 0.5
## 35: 2020-02-04 8578 0.09138889 0.2 0.00 -1.3
## 36: 2020-02-07 8187 1.15611111 0.0 0.00 0.1
## 37: 2020-02-08 10099 0.00000000 0.8 0.00 2.4
## 38: 2020-02-09 8160 1.14805556 6.4 0.00 4.5
## 39: 2020-02-10 3797 0.23944444 0.7 0.00 5.7
## 40: 2020-02-11 8734 0.69166667 0.0 0.00 4.0
## 41: 2020-02-12 4355 1.69333333 1.5 0.00 3.2
## 42: 2020-02-13 8452 8.53333333 0.0 0.00 1.0
## 43: 2020-02-14 7367 6.66250000 0.0 0.00 0.8
## 44: 2020-02-15 9339 0.01416667 4.6 0.00 3.8
## 45: 2020-02-16 5427 0.00000000 1.5 0.00 7.1
## 46: 2020-02-17 7022 1.55083333 0.1 0.00 6.6
## 47: 2020-02-18 8417 2.58805556 0.0 0.00 6.6
## 48: 2020-02-19 8760 7.62638889 0.0 0.00 4.5
## 49: 2020-02-20 4936 3.16527778 5.0 0.00 2.8
## 50: 2020-02-21 11273 7.49055556 0.0 0.00 5.7
## 51: 2020-02-22 7046 0.20055556 0.8 0.00 6.5
## 52: 2020-02-23 10074 6.78666667 0.0 0.00 5.2
## 53: 2020-02-24 9114 8.94694444 0.0 0.00 2.3
## 54: 2020-02-25 4493 2.53583333 1.1 0.00 1.4
## 55: 2020-02-26 4992 0.51944444 0.5 0.00 0.2
## 56: 2020-02-27 3789 0.00000000 0.8 0.01 -1.3
## 57: 2020-02-28 7480 6.55861111 0.2 0.05 -1.3
## 58: 2020-02-29 8358 2.25750000 4.5 0.05 1.1
## 59: 2020-03-01 NA 0.91555556 1.0 0.00 4.2
## 60: 2020-03-02 6489 6.01888889 4.0 0.00 4.4
## 61: 2020-03-03 3586 0.00000000 7.2 0.00 3.7
## 62: 2020-03-04 2570 0.00000000 6.4 0.00 2.1
## 63: 2020-03-05 5059 0.04666667 0.1 0.00 2.1
## 64: 2020-03-06 9087 9.34805556 0.0 0.00 2.0
## 65: 2020-03-07 11570 2.85055556 0.0 0.00 1.9
## 66: 2020-03-08 10476 2.89916667 0.0 0.00 4.5
## 67: 2020-03-09 8977 3.60500000 2.0 0.00 6.8
## 68: 2020-03-11 5837 0.13555556 0.0 0.00 5.6
## 69: 2020-03-12 3559 0.66138889 15.2 0.00 4.9
## 70: 2020-03-13 7489 5.24972222 0.1 0.00 2.1
## 71: 2020-03-14 8138 11.19111111 0.0 0.00 -0.9
## 72: 2020-03-15 8876 5.43888889 2.1 0.00 2.0
## 73: 2020-03-16 4800 6.09472222 2.9 0.00 4.1
## 74: 2020-03-17 3836 0.59083333 0.0 0.00 5.7
## 75: 2020-03-18 2750 1.32916667 5.8 0.00 6.3
## 76: 2020-03-19 7715 11.57500000 0.1 0.00 5.5
## 77: 2020-03-20 4364 3.21694444 0.1 0.00 2.1
## 78: 2020-03-21 9393 10.37333333 0.0 0.00 0.0
## 79: 2020-03-22 7830 11.13527778 0.0 0.00 0.1
## 80: 2020-03-23 4858 6.43416667 0.0 0.00 2.8
## 81: 2020-03-24 5140 5.35000000 0.0 0.00 4.2
## 82: 2020-03-25 7043 6.95083333 0.0 0.00 7.0
## 83: 2020-03-26 8662 11.44222222 0.0 0.00 6.5
## 84: 2020-03-27 9196 9.69944444 0.0 0.00 6.2
## 85: 2020-03-28 8267 3.41638889 0.0 0.00 5.3
## 86: 2020-03-30 4249 7.36500000 0.1 0.00 0.1
## 87: 2020-03-31 8291 10.23666667 0.0 0.00 3.4
## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# Or:
# setkey(rev_data, DATE)
# rev_data[weather_data]
# dplyr
# rev_data %>%
# right_join(weather_data, by = "DATE")
# full join
# data.table
# merge() with all = TRUE keeps every DATE found in either table;
# dates present on only one side get NA in the other side's columns
merge(rev_data, weather_data, all = TRUE, by = "DATE")## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
## 1: 2020-01-01 7637 0.00000000 0.0 0.00 0.0
## 2: 2020-01-02 9276 0.37250000 0.0 0.00 5.8
## 3: 2020-01-03 11170 0.26361111 0.0 0.00 5.4
## 4: 2020-01-04 11863 3.54861111 0.0 0.00 2.4
## 5: 2020-01-05 10880 2.51250000 2.4 0.00 0.1
## 6: 2020-01-06 6702 0.09333333 0.2 0.00 4.8
## 7: 2020-01-07 8652 0.00000000 0.0 0.00 5.5
## 8: 2020-01-08 8346 0.16166667 0.0 0.00 6.5
## 9: 2020-01-09 6543 0.31944444 0.6 0.00 3.9
## 10: 2020-01-10 8115 1.58694444 0.0 0.00 1.0
## 11: 2020-01-11 7728 0.00000000 0.2 0.00 1.2
## 12: 2020-01-12 10649 0.44194444 0.0 0.00 5.9
## 13: 2020-01-13 6787 2.42083333 0.0 0.00 2.6
## 14: 2020-01-14 4555 0.00000000 7.4 0.00 3.8
## 15: 2020-01-15 5885 0.00000000 0.0 0.00 7.8
## 16: 2020-01-16 10127 0.85583333 0.0 0.00 6.1
## 17: 2020-01-17 8893 0.00000000 0.0 0.00 4.8
## 18: 2020-01-18 12520 0.05500000 0.0 0.00 4.8
## 19: 2020-01-19 11860 6.18555556 0.0 0.00 3.1
## 20: 2020-01-20 8515 0.77305556 0.0 0.00 3.8
## 21: 2020-01-21 8129 2.09777778 0.0 0.00 6.8
## 22: 2020-01-22 10405 3.55250000 0.0 0.00 3.2
## 23: 2020-01-23 6672 2.02694444 0.0 0.00 1.5
## 24: 2020-01-24 12300 4.97361111 0.0 0.00 5.1
## 25: 2020-01-25 10651 4.45861111 0.0 0.00 2.1
## 26: 2020-01-26 11882 0.00000000 0.0 0.00 5.8
## 27: 2020-01-27 9397 0.00000000 0.6 0.00 4.8
## 28: 2020-01-28 5174 0.00000000 10.9 0.00 3.4
## 29: 2020-01-29 4436 0.00000000 1.5 0.00 3.3
## 30: 2020-01-30 6202 0.00000000 0.2 0.00 2.4
## 31: 2020-01-31 9949 0.00000000 2.0 0.00 3.0
## 32: 2020-02-01 10192 1.50750000 1.5 0.00 6.1
## 33: 2020-02-02 13904 6.70888889 0.6 0.00 3.8
## 34: 2020-02-03 11208 6.85888889 0.0 0.00 0.5
## 35: 2020-02-04 8578 0.09138889 0.2 0.00 -1.3
## 36: 2020-02-05 6638 NA NA NA NA
## 37: 2020-02-06 7093 NA NA NA NA
## 38: 2020-02-07 8187 1.15611111 0.0 0.00 0.1
## 39: 2020-02-08 10099 0.00000000 0.8 0.00 2.4
## 40: 2020-02-09 8160 1.14805556 6.4 0.00 4.5
## 41: 2020-02-10 3797 0.23944444 0.7 0.00 5.7
## 42: 2020-02-11 8734 0.69166667 0.0 0.00 4.0
## 43: 2020-02-12 4355 1.69333333 1.5 0.00 3.2
## 44: 2020-02-13 8452 8.53333333 0.0 0.00 1.0
## 45: 2020-02-14 7367 6.66250000 0.0 0.00 0.8
## 46: 2020-02-15 9339 0.01416667 4.6 0.00 3.8
## 47: 2020-02-16 5427 0.00000000 1.5 0.00 7.1
## 48: 2020-02-17 7022 1.55083333 0.1 0.00 6.6
## 49: 2020-02-18 8417 2.58805556 0.0 0.00 6.6
## 50: 2020-02-19 8760 7.62638889 0.0 0.00 4.5
## 51: 2020-02-20 4936 3.16527778 5.0 0.00 2.8
## 52: 2020-02-21 11273 7.49055556 0.0 0.00 5.7
## 53: 2020-02-22 7046 0.20055556 0.8 0.00 6.5
## 54: 2020-02-23 10074 6.78666667 0.0 0.00 5.2
## 55: 2020-02-24 9114 8.94694444 0.0 0.00 2.3
## 56: 2020-02-25 4493 2.53583333 1.1 0.00 1.4
## 57: 2020-02-26 4992 0.51944444 0.5 0.00 0.2
## 58: 2020-02-27 3789 0.00000000 0.8 0.01 -1.3
## 59: 2020-02-28 7480 6.55861111 0.2 0.05 -1.3
## 60: 2020-02-29 8358 2.25750000 4.5 0.05 1.1
## 61: 2020-03-01 NA 0.91555556 1.0 0.00 4.2
## 62: 2020-03-02 6489 6.01888889 4.0 0.00 4.4
## 63: 2020-03-03 3586 0.00000000 7.2 0.00 3.7
## 64: 2020-03-04 2570 0.00000000 6.4 0.00 2.1
## 65: 2020-03-05 5059 0.04666667 0.1 0.00 2.1
## 66: 2020-03-06 9087 9.34805556 0.0 0.00 2.0
## 67: 2020-03-07 11570 2.85055556 0.0 0.00 1.9
## 68: 2020-03-08 10476 2.89916667 0.0 0.00 4.5
## 69: 2020-03-09 8977 3.60500000 2.0 0.00 6.8
## 70: 2020-03-10 4509 NA NA NA NA
## 71: 2020-03-11 5837 0.13555556 0.0 0.00 5.6
## 72: 2020-03-12 3559 0.66138889 15.2 0.00 4.9
## 73: 2020-03-13 7489 5.24972222 0.1 0.00 2.1
## 74: 2020-03-14 8138 11.19111111 0.0 0.00 -0.9
## 75: 2020-03-15 8876 5.43888889 2.1 0.00 2.0
## 76: 2020-03-16 4800 6.09472222 2.9 0.00 4.1
## 77: 2020-03-17 3836 0.59083333 0.0 0.00 5.7
## 78: 2020-03-18 2750 1.32916667 5.8 0.00 6.3
## 79: 2020-03-19 7715 11.57500000 0.1 0.00 5.5
## 80: 2020-03-20 4364 3.21694444 0.1 0.00 2.1
## 81: 2020-03-21 9393 10.37333333 0.0 0.00 0.0
## 82: 2020-03-22 7830 11.13527778 0.0 0.00 0.1
## 83: 2020-03-23 4858 6.43416667 0.0 0.00 2.8
## 84: 2020-03-24 5140 5.35000000 0.0 0.00 4.2
## 85: 2020-03-25 7043 6.95083333 0.0 0.00 7.0
## 86: 2020-03-26 8662 11.44222222 0.0 0.00 6.5
## 87: 2020-03-27 9196 9.69944444 0.0 0.00 6.2
## 88: 2020-03-28 8267 3.41638889 0.0 0.00 5.3
## 89: 2020-03-29 8237 NA NA NA NA
## 90: 2020-03-30 4249 7.36500000 0.1 0.00 0.1
## 91: 2020-03-31 8291 10.23666667 0.0 0.00 3.4
## DATE REVENUE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# dplyr
# rev_data %>%
# full_join( weather_data, by = "DATE")
# another example
# full outer join of two candidate tables on last name; both inputs
# (contributions_split, results_split) are built earlier in the report
mayordata <- full_join(contributions_split, results_split, by = "LastName")
str(mayordata)## Classes 'tabyl' and 'data.frame': 7 obs. of 3 variables:
## $ LastName : chr "Horrigan" "Neves-Grigg" "Sen" "Sousa" ...
## $ Pct_Local_Contributors: num 0.03582 0.01194 0.00896 0.02985 0.51642 ...
## $ Pct_Vote : num 0.04996 0.01228 0.00926 0.04932 0.54703 ...
head(mayordata)## LastName Pct_Local_Contributors Pct_Vote
## Horrigan 0.035820896 0.049963330
## Neves-Grigg 0.011940299 0.012284562
## Sen 0.008955224 0.009259259
## Sousa 0.029850746 0.049321599
## Spicer 0.516417910 0.547029703
## Stefanini 0.337313433 0.291895856
# preparing the data
# data.table row filter: February 2020 days with a sub-zero temperature
# (%between% is inclusive on both endpoints)
filter_data <- weather_data[TEMPERATURE < 0 & DATE %between% c("2020-02-01", "2020-02-29"), ]
head(filter_data)## DATE SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
## 1: 2020-02-04 0.09138889 0.2 0.00 -1.3
## 2: 2020-02-27 0.00000000 0.8 0.01 -1.3
## 3: 2020-02-28 6.55861111 0.2 0.05 -1.3
# using a semijoin
# data.table
# keep rev_data rows whose DATE occurs in filter_data, without adding any
# of filter_data's columns; which = TRUE returns matching row numbers only
setkey(rev_data, DATE)
rev_data[rev_data[filter_data, which = TRUE]]## DATE REVENUE
## 1: 2020-02-04 8578
## 2: 2020-02-27 3789
## 3: 2020-02-28 7480
# dplyr
# rev_data %>%
# semi_join(filter_data, by = "DATE")
# antijoin
# data.table
# keep rev_data rows whose DATE does NOT occur in filter_data;
# ! negates the keyed join set up by setkey() above
setkey(rev_data, DATE)
rev_data[!filter_data]## DATE REVENUE
## 1: 2020-01-01 7637
## 2: 2020-01-02 9276
## 3: 2020-01-03 11170
## 4: 2020-01-04 11863
## 5: 2020-01-05 10880
## 6: 2020-01-06 6702
## 7: 2020-01-07 8652
## 8: 2020-01-08 8346
## 9: 2020-01-09 6543
## 10: 2020-01-10 8115
## 11: 2020-01-11 7728
## 12: 2020-01-12 10649
## 13: 2020-01-13 6787
## 14: 2020-01-14 4555
## 15: 2020-01-15 5885
## 16: 2020-01-16 10127
## 17: 2020-01-17 8893
## 18: 2020-01-18 12520
## 19: 2020-01-19 11860
## 20: 2020-01-20 8515
## 21: 2020-01-21 8129
## 22: 2020-01-22 10405
## 23: 2020-01-23 6672
## 24: 2020-01-24 12300
## 25: 2020-01-25 10651
## 26: 2020-01-26 11882
## 27: 2020-01-27 9397
## 28: 2020-01-28 5174
## 29: 2020-01-29 4436
## 30: 2020-01-30 6202
## 31: 2020-01-31 9949
## 32: 2020-02-01 10192
## 33: 2020-02-02 13904
## 34: 2020-02-03 11208
## 35: 2020-02-05 6638
## 36: 2020-02-06 7093
## 37: 2020-02-07 8187
## 38: 2020-02-08 10099
## 39: 2020-02-09 8160
## 40: 2020-02-10 3797
## 41: 2020-02-11 8734
## 42: 2020-02-12 4355
## 43: 2020-02-13 8452
## 44: 2020-02-14 7367
## 45: 2020-02-15 9339
## 46: 2020-02-16 5427
## 47: 2020-02-17 7022
## 48: 2020-02-18 8417
## 49: 2020-02-19 8760
## 50: 2020-02-20 4936
## 51: 2020-02-21 11273
## 52: 2020-02-22 7046
## 53: 2020-02-23 10074
## 54: 2020-02-24 9114
## 55: 2020-02-25 4493
## 56: 2020-02-26 4992
## 57: 2020-02-29 8358
## 58: 2020-03-02 6489
## 59: 2020-03-03 3586
## 60: 2020-03-04 2570
## 61: 2020-03-05 5059
## 62: 2020-03-06 9087
## 63: 2020-03-07 11570
## 64: 2020-03-08 10476
## 65: 2020-03-09 8977
## 66: 2020-03-10 4509
## 67: 2020-03-11 5837
## 68: 2020-03-12 3559
## 69: 2020-03-13 7489
## 70: 2020-03-14 8138
## 71: 2020-03-15 8876
## 72: 2020-03-16 4800
## 73: 2020-03-17 3836
## 74: 2020-03-18 2750
## 75: 2020-03-19 7715
## 76: 2020-03-20 4364
## 77: 2020-03-21 9393
## 78: 2020-03-22 7830
## 79: 2020-03-23 4858
## 80: 2020-03-24 5140
## 81: 2020-03-25 7043
## 82: 2020-03-26 8662
## 83: 2020-03-27 9196
## 84: 2020-03-28 8267
## 85: 2020-03-29 8237
## 86: 2020-03-30 4249
## 87: 2020-03-31 8291
## DATE REVENUE
# dplyr
# rev_data %>%
# anti_join(filter_data, by = "DATE")# returns column index names in table format
# wrap the character vector of column names in a one-column data frame
# so it prints one name per row
data.frame(colnames(snowdata))## colnames.snowdata.
## 1 Winter
## 2 SnowInches
## 3 SnowMeters
# returns row index numbers in table format
# default data-frame row names are the strings "1", "2", ...; coerce to integer
data.frame(as.integer(rownames(snowdata)))## as.integer.rownames.snowdata..
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
## 11 11
## 12 12
## 13 13
## 14 14
## 15 15
## 16 16
## 17 17
## 18 18
## 19 19
## 20 20
## 21 21
## 22 22
## 23 23
## 24 24
## 25 25
## 26 26
## 27 27
## 28 28
## 29 29
## 30 30
## 31 31
## 32 32
## 33 33
## 34 34
## 35 35
## 36 36
## 37 37
## 38 38
## 39 39
## 40 40
## 41 41
## 42 42
## 43 43
## 44 44
## 45 45
## 46 46
## 47 47
## 48 48
## 49 49
## 50 50
## 51 51
## 52 52
## 53 53
## 54 54
## 55 55
## 56 56
## 57 57
## 58 58
## 59 59
## 60 60
## 61 61
## 62 62
## 63 63
## 64 64
## 65 65
## 66 66
## 67 67
## 68 68
## 69 69
## 70 70
## 71 71
## 72 72
## 73 73
## 74 74
## 75 75
## 76 76
# getting a sense of the data set
# str(): compact structure overview (class, dimensions, column types)
str(snowdata)## 'data.frame': 76 obs. of 3 variables:
## $ Winter : chr "1940-1941" "1941-1942" "1942-1943" "1943-1944" ...
## $ SnowInches: num 47.8 23.9 45.7 27.7 59.2 50.8 19.4 89.2 37.1 32 ...
## $ SnowMeters: num 1.214 0.607 1.161 0.704 1.504 ...
# showing the number of rows and columns...
# dim() returns c(rows, cols); nrow()/ncol() return each count alone
dim(snowdata)## [1] 76 3
nrow(snowdata)## [1] 76
ncol(snowdata)## [1] 3
# ...and names
# dimnames() returns a two-element list: [[1]] row names, [[2]] column names
dimnames(snowdata)## [[1]]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76"
##
## [[2]]
## [1] "Winter" "SnowInches" "SnowMeters"
rownames(snowdata)## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76"
colnames(snowdata)## [1] "Winter" "SnowInches" "SnowMeters"
# a brief statistical summary of a data set, run the summary() function
# numerics get min/quartiles/mean/max; character columns get length/class/mode
summary(snowdata)## Winter SnowInches SnowMeters
## Length:76 Min. : 9.30 Min. :0.2362
## Class :character 1st Qu.: 27.57 1st Qu.:0.7004
## Mode :character Median : 42.75 Median :1.0858
## Mean : 44.49 Mean :1.1299
## 3rd Qu.: 57.60 3rd Qu.:1.4630
## Max. :110.60 Max. :2.8092
glimpse(snowdata)## Rows: 76
## Columns: 3
## $ Winter <chr> "1940-1941", "1941-1942", "1942-1943", "1943-1944", "1944-1…
## $ SnowInches <dbl> 47.8, 23.9, 45.7, 27.7, 59.2, 50.8, 19.4, 89.2, 37.1, 32.0,…
## $ SnowMeters <dbl> 1.21412, 0.60706, 1.16078, 0.70358, 1.50368, 1.29032, 0.492…
Find and concisely describe the difference between a pair of R
objects with the waldo package. One of the first things worth doing
after importing a data set is looking at the first few rows, the last
few rows, and a summary of some basic stats. This can be easily achieved
thanks to the headTail() function of the psych
package. To get a brief statistical summary you can use the
describe() function of the Hmisc package or the
psych package (only works for numeric data). The skimr
package’s skim() function will show information on each
column, including a little histogram for each numeric one.
# two deliberately different data frames: df2 adds a row and a column Z,
# changes Y[1], and stores A as character instead of double, so that
# waldo::compare() has several kinds of differences to report
df1 <- data.frame(X = c(1, 2, 3), Y = c("a", "b", "c"), A = c(3, 4, 5))
df2 <- data.frame(X = c(1, 2, 3, 4), Y = c("A", "b", "c", "d"), Z = c("k", "l", "m", "n"), A = c("3", "4", "5", "6"))
waldo::compare(df1, df2)## `old` is length 3
## `new` is length 4
##
## `names(old)`: "X" "Y" "A"
## `names(new)`: "X" "Y" "Z" "A"
##
## `attr(old, 'row.names')`: 1 2 3
## `attr(new, 'row.names')`: 1 2 3 4
##
## `old$X`: 1 2 3
## `new$X`: 1 2 3 4
##
## `old$Y[2:3]`: "b" "c"
## `new$Y`: "A" "b" "c" "d"
##
## `old$A` is a double vector (3, 4, 5)
## `new$A` is a character vector ('3', '4', '5', '6')
##
## `old$Z` is absent
## `new$Z` is a character vector ('k', 'l', 'm', 'n')
# getting the first and last rows
# psych::headTail(): first and last few rows separated by an ellipsis row
headTail(snowdata)## Winter SnowInches SnowMeters
## 1 1940-1941 47.8 1.21
## 2 1941-1942 23.9 0.61
## 3 1942-1943 45.7 1.16
## 4 1943-1944 27.7 0.7
## ... <NA> ... ...
## 73 2012-2013 63.4 1.61
## 74 2013-2014 58.9 1.5
## 75 2014-2015 110.6 2.81
## 76 2015-2016 36.2 0.92
# getting statistical info
# Hmisc::describe(): n/missing/distinct per column, plus quantiles and
# lowest/highest values for numeric columns
Hmisc::describe(snowdata)## snowdata
##
## 3 Variables 76 Observations
## --------------------------------------------------------------------------------
## Winter
## n missing distinct
## 76 0 76
##
## lowest : 1940-1941 1941-1942 1942-1943 1943-1944 1944-1945
## highest: 2011-2012 2012-2013 2013-2014 2014-2015 2015-2016
## --------------------------------------------------------------------------------
## SnowInches
## n missing distinct Info Mean Gmd .05 .10
## 76 0 75 1 44.49 24.92 15.05 18.60
## .25 .50 .75 .90 .95
## 27.58 42.75 57.60 75.95 87.25
##
## lowest : 9.3 10.3 12.5 14.9 15.1, highest: 86.6 89.2 96.3 107.6 110.6
## --------------------------------------------------------------------------------
## SnowMeters
## n missing distinct Info Mean Gmd .05 .10
## 76 0 75 1 1.13 0.6329 0.3823 0.4724
## .25 .50 .75 .90 .95
## 0.7004 1.0858 1.4630 1.9291 2.2161
##
## lowest : 0.23622 0.26162 0.31750 0.37846 0.38354
## highest: 2.19964 2.26568 2.44602 2.73304 2.80924
## --------------------------------------------------------------------------------
psych::describe(snowdata)## vars n mean sd median trimmed mad min max range skew
## Winter* 1 76 38.50 22.08 38.50 38.50 28.17 1.00 76.00 75.00 0.00
## SnowInches 2 76 44.49 22.51 42.75 42.37 22.54 9.30 110.60 101.30 0.84
## SnowMeters 3 76 1.13 0.57 1.09 1.08 0.57 0.24 2.81 2.57 0.84
## kurtosis se
## Winter* -1.25 2.53
## SnowInches 0.46 2.58
## SnowMeters 0.46 0.07
skim(snowdata)| Name | snowdata |
| Number of rows | 76 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Winter | 0 | 1 | 9 | 9 | 0 | 76 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| SnowInches | 0 | 1 | 44.49 | 22.51 | 9.30 | 27.58 | 42.75 | 57.60 | 110.60 | ▆▇▅▂▁ |
| SnowMeters | 0 | 1 | 1.13 | 0.57 | 0.24 | 0.70 | 1.09 | 1.46 | 2.81 | ▆▇▅▂▁ |
For objects that contain more than one element (vectors, matrices, arrays, data frames, and lists), subscripting is used to access some or all of those elements. Besides the usual numeric subscripts, R allows the use of character or logical values for subscripting. Subscripting operations are very fast and efficient, and are often the most powerful tool for accessing and manipulating data in R.
Like most computer languages, numeric subscripts can be used to
access the elements of a vector, array, or list. The first element of an
object has subscript 1; subscripts of 0 are silently ignored. In
addition to a single number, a vector of subscripts (or, for example, a
function call that returns a vector of subscripts) can be used to access
multiple elements. The colon operator and the seq function
are especially useful here. Negative subscripts in R extract all of the
elements of an object except the ones specified in the negative
subscript; thus, when using numeric subscripts, subscripts must be
either all positive (or zero) or all negative (or zero).
If a subscriptable object is named, a character string or vector of
character strings can be used as a subscript. Negative character
subscripts are not permitted; if you need to exclude elements based on
their names, the grep function can be used.
Logical values can be used to selectively access elements of a
subscriptable object, provided the size of the logical object is the
same as the object (or part of the object) that is being subscripted.
Elements corresponding to TRUE values in the logical vector will be
included, and objects corresponding to FALSE values will not. Logical
subscripting provides a very powerful and simple way to perform tasks
that might otherwise require loops. Like most operations in R, logical
operators are vectorized; applying a logical subscript to a vector or an
array will produce an object of the same size and shape as the original
object. To find the indices of elements, R provides the
which function, which accepts a logical vector, and returns
a vector containing the subscripts of the elements for which the
logical vector was true. Logical subscripts allow for
modification of elements that meet a particular condition by using an
appropriately subscripted object on the left-hand side of an assignment
statement.
Lists are the most general way to store a collection of objects in R, because there is no limitation on the mode of the objects that a list may hold. Although it hasn't been explicitly stated, one rule of subscripting in R is that subscripting will always return an object of the same mode as the object being subscripted. For matrices and vectors, this is completely natural, and should never cause confusion. But for lists, there is a subtle distinction between part of a list, and the object which that part of the list represents.
If the elements of the list are named, the actual contents of the elements can be accessed by separating the name of the list from the name of the element with a dollar sign ($). For interactive sessions, using the dollar sign notation is the natural way to perform operations on the elements of a list. For those situations where the dollar sign notation would be inappropriate (for example, accessing elements through their index or through a name stored in a character variable), R provides the double bracket subscript operator. Double brackets are not restricted to respect the mode of the object they are subscripting, and will extract the actual list element from the list.
The key thing to notice is that in this case, single brackets will always return a list containing the selected element(s), while double brackets will return the actual contents of selected list element.
Since data frames are a cross between a list and a matrix, it's not
surprising that both matrix and list subscripting techniques apply to
data frames. When using logical subscripts with data frames containing
missing values, it may be necessary to remove the missing values before
the logical comparison is made, or unexpected results may occur. This
situation is so common that R provides the subset function
which accepts a data frame, matrix or vector, and a logical expression
as its first two arguments, and which returns a similar object
containing only those elements that meet the condition of the logical
expression. It ensures that missing values don't get included, and, if
its first argument is a data frame or matrix with named columns, it also
resolves variable names inside the logical expression from the object
passed as the first argument. A further convenience is offered by the
select= argument which will extract only the specified
columns from the data frame passed as the first argument. The argument
to select= is a vector of integers or variable names which
correspond to the columns that are to be extracted. Unlike most other
functions in R, names passed through the select= argument
can be either quoted or unquoted. To ignore columns, their name or index
number can be preceded by a negative sign (-). Since the
select= argument works by replacing variable names with
their corresponding column indices, ranges of columns can be specified
using variable names. The subset function will always
return a new data frame, matrix or vector, so it is not suited
for modifying selected parts of a data frame.
To sort the elements of an object, use the sort
function. Add the decreasing=TRUE option to sort in reverse
order. You can control the treatment of NA values by setting the
na.last argument. To sort a data frame, you need to create
a permutation of the indices from the data frame and use these to fetch
the rows of the data frame in the correct order. You can generate an
appropriate permutation of the indices using the order
function. The order function takes a set of vectors as
arguments. It sorts recursively by each vector, breaking ties by looking
at successive vectors in the argument list. At the end, it returns a
permutation of the indices of the vector corresponding to the sorted
order.
# subsetting a list
# a: character vector of three names; b: numeric vector of four ages
simple <- list(a = c("fred", "sam", "harry"), b = c(24, 17, 19, 22))
mode(simple)## [1] "list"
# single-bracket subscripting keeps the list wrapper around the element
simple[2]## $b
## [1] 24 17 19 22
mode(simple[2])## [1] "list"
# $ and [[ ]] both drop the wrapper and return the element's actual contents
simple$b## [1] 24 17 19 22
mean(simple$b)## [1] 20.5
mean(simple[[2]])## [1] 20.5
# single brackets return a list
simple[1]## $a
## [1] "fred" "sam" "harry"
# double brackets return the actual contents of selected list element
simple[[1]]## [1] "fred" "sam" "harry"
# subsetting a data frame
# First, we check the order of the columns:
# (airquality is a built-in R data set of daily air-quality measurements)
data.frame(names(airquality))## names.airquality.
## 1 Ozone
## 2 Solar.R
## 3 Wind
## 4 Temp
## 5 Month
## 6 Day
# scalar [row, column] indexing returns a single value
airquality[5, 4] # The 5th element from the 4th column,## [1] 56
# i.e. the same as airquality$Temp[5]
# an empty column position selects all columns, giving a one-row data frame
airquality[5,] # The 5th row of the data## Ozone Solar.R Wind Temp Month Day
## 5 NA NA 14.3 56 5 5
airquality[, 4] # The 4th column of the data, like airquality$Temp## [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
## [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
## [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
airquality[[4]] # The 4th column of the data, like airquality$Temp## [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
## [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
## [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
airquality[, c(2, 4, 6)] # The 2nd, 4th and 6th columns of the data## Solar.R Temp Day
## 1 190 67 1
## 2 118 72 2
## 3 149 74 3
## 4 313 62 4
## 5 NA 56 5
## 6 NA 66 6
## 7 299 65 7
## 8 99 59 8
## 9 19 61 9
## 10 194 69 10
## 11 NA 74 11
## 12 256 69 12
## 13 290 66 13
## 14 274 68 14
## 15 65 58 15
## 16 334 64 16
## 17 307 66 17
## 18 78 57 18
## 19 322 68 19
## 20 44 62 20
## 21 8 59 21
## 22 320 73 22
## 23 25 61 23
## 24 92 61 24
## 25 66 57 25
## 26 266 58 26
## 27 NA 57 27
## 28 13 67 28
## 29 252 81 29
## 30 223 79 30
## 31 279 76 31
## 32 286 78 1
## 33 287 74 2
## 34 242 67 3
## 35 186 84 4
## 36 220 85 5
## 37 264 79 6
## 38 127 82 7
## 39 273 87 8
## 40 291 90 9
## 41 323 87 10
## 42 259 93 11
## 43 250 92 12
## 44 148 82 13
## 45 332 80 14
## 46 322 79 15
## 47 191 77 16
## 48 284 72 17
## 49 37 65 18
## 50 120 73 19
## 51 137 76 20
## 52 150 77 21
## 53 59 76 22
## 54 91 76 23
## 55 250 76 24
## 56 135 75 25
## 57 127 78 26
## 58 47 73 27
## 59 98 80 28
## 60 31 77 29
## 61 138 83 30
## 62 269 84 1
## 63 248 85 2
## 64 236 81 3
## 65 101 84 4
## 66 175 83 5
## 67 314 83 6
## 68 276 88 7
## 69 267 92 8
## 70 272 92 9
## 71 175 89 10
## 72 139 82 11
## 73 264 73 12
## 74 175 81 13
## 75 291 91 14
## 76 48 80 15
## 77 260 81 16
## 78 274 82 17
## 79 285 84 18
## 80 187 87 19
## 81 220 85 20
## 82 7 74 21
## 83 258 81 22
## 84 295 82 23
## 85 294 86 24
## 86 223 85 25
## 87 81 82 26
## 88 82 86 27
## 89 213 88 28
## 90 275 86 29
## 91 253 83 30
## 92 254 81 31
## 93 83 81 1
## 94 24 81 2
## 95 77 82 3
## 96 NA 86 4
## 97 NA 85 5
## 98 NA 87 6
## 99 255 89 7
## 100 229 90 8
## 101 207 90 9
## 102 222 92 10
## 103 137 86 11
## 104 192 86 12
## 105 273 82 13
## 106 157 80 14
## 107 64 79 15
## 108 71 77 16
## 109 51 79 17
## 110 115 76 18
## 111 244 78 19
## 112 190 78 20
## 113 259 77 21
## 114 36 72 22
## 115 255 75 23
## 116 212 79 24
## 117 238 81 25
## 118 215 86 26
## 119 153 88 27
## 120 203 97 28
## 121 225 94 29
## 122 237 96 30
## 123 188 94 31
## 124 167 91 1
## 125 197 92 2
## 126 183 93 3
## 127 189 93 4
## 128 95 87 5
## 129 92 84 6
## 130 252 80 7
## 131 220 78 8
## 132 230 75 9
## 133 259 73 10
## 134 236 81 11
## 135 259 76 12
## 136 238 77 13
## 137 24 71 14
## 138 112 71 15
## 139 237 78 16
## 140 224 67 17
## 141 27 76 18
## 142 238 68 19
## 143 201 82 20
## 144 238 64 21
## 145 14 71 22
## 146 139 81 23
## 147 49 69 24
## 148 20 63 25
## 149 193 70 26
## 150 145 77 27
## 151 191 75 28
## 152 131 76 29
## 153 223 68 30
airquality[, -2] # All columns except the 2nd one## Ozone Wind Temp Month Day
## 1 41 7.4 67 5 1
## 2 36 8.0 72 5 2
## 3 12 12.6 74 5 3
## 4 18 11.5 62 5 4
## 5 NA 14.3 56 5 5
## 6 28 14.9 66 5 6
## 7 23 8.6 65 5 7
## 8 19 13.8 59 5 8
## 9 8 20.1 61 5 9
## 10 NA 8.6 69 5 10
## 11 7 6.9 74 5 11
## 12 16 9.7 69 5 12
## 13 11 9.2 66 5 13
## 14 14 10.9 68 5 14
## 15 18 13.2 58 5 15
## 16 14 11.5 64 5 16
## 17 34 12.0 66 5 17
## 18 6 18.4 57 5 18
## 19 30 11.5 68 5 19
## 20 11 9.7 62 5 20
## 21 1 9.7 59 5 21
## 22 11 16.6 73 5 22
## 23 4 9.7 61 5 23
## 24 32 12.0 61 5 24
## 25 NA 16.6 57 5 25
## 26 NA 14.9 58 5 26
## 27 NA 8.0 57 5 27
## 28 23 12.0 67 5 28
## 29 45 14.9 81 5 29
## 30 115 5.7 79 5 30
## 31 37 7.4 76 5 31
## 32 NA 8.6 78 6 1
## 33 NA 9.7 74 6 2
## 34 NA 16.1 67 6 3
## 35 NA 9.2 84 6 4
## 36 NA 8.6 85 6 5
## 37 NA 14.3 79 6 6
## 38 29 9.7 82 6 7
## 39 NA 6.9 87 6 8
## 40 71 13.8 90 6 9
## 41 39 11.5 87 6 10
## 42 NA 10.9 93 6 11
## 43 NA 9.2 92 6 12
## 44 23 8.0 82 6 13
## 45 NA 13.8 80 6 14
## 46 NA 11.5 79 6 15
## 47 21 14.9 77 6 16
## 48 37 20.7 72 6 17
## 49 20 9.2 65 6 18
## 50 12 11.5 73 6 19
## 51 13 10.3 76 6 20
## 52 NA 6.3 77 6 21
## 53 NA 1.7 76 6 22
## 54 NA 4.6 76 6 23
## 55 NA 6.3 76 6 24
## 56 NA 8.0 75 6 25
## 57 NA 8.0 78 6 26
## 58 NA 10.3 73 6 27
## 59 NA 11.5 80 6 28
## 60 NA 14.9 77 6 29
## 61 NA 8.0 83 6 30
## 62 135 4.1 84 7 1
## 63 49 9.2 85 7 2
## 64 32 9.2 81 7 3
## 65 NA 10.9 84 7 4
## 66 64 4.6 83 7 5
## 67 40 10.9 83 7 6
## 68 77 5.1 88 7 7
## 69 97 6.3 92 7 8
## 70 97 5.7 92 7 9
## 71 85 7.4 89 7 10
## 72 NA 8.6 82 7 11
## 73 10 14.3 73 7 12
## 74 27 14.9 81 7 13
## 75 NA 14.9 91 7 14
## 76 7 14.3 80 7 15
## 77 48 6.9 81 7 16
## 78 35 10.3 82 7 17
## 79 61 6.3 84 7 18
## 80 79 5.1 87 7 19
## 81 63 11.5 85 7 20
## 82 16 6.9 74 7 21
## 83 NA 9.7 81 7 22
## 84 NA 11.5 82 7 23
## 85 80 8.6 86 7 24
## 86 108 8.0 85 7 25
## 87 20 8.6 82 7 26
## 88 52 12.0 86 7 27
## 89 82 7.4 88 7 28
## 90 50 7.4 86 7 29
## 91 64 7.4 83 7 30
## 92 59 9.2 81 7 31
## 93 39 6.9 81 8 1
## 94 9 13.8 81 8 2
## 95 16 7.4 82 8 3
## 96 78 6.9 86 8 4
## 97 35 7.4 85 8 5
## 98 66 4.6 87 8 6
## 99 122 4.0 89 8 7
## 100 89 10.3 90 8 8
## 101 110 8.0 90 8 9
## 102 NA 8.6 92 8 10
## 103 NA 11.5 86 8 11
## 104 44 11.5 86 8 12
## 105 28 11.5 82 8 13
## 106 65 9.7 80 8 14
## 107 NA 11.5 79 8 15
## 108 22 10.3 77 8 16
## 109 59 6.3 79 8 17
## 110 23 7.4 76 8 18
## 111 31 10.9 78 8 19
## 112 44 10.3 78 8 20
## 113 21 15.5 77 8 21
## 114 9 14.3 72 8 22
## 115 NA 12.6 75 8 23
## 116 45 9.7 79 8 24
## 117 168 3.4 81 8 25
## 118 73 8.0 86 8 26
## 119 NA 5.7 88 8 27
## 120 76 9.7 97 8 28
## 121 118 2.3 94 8 29
## 122 84 6.3 96 8 30
## 123 85 6.3 94 8 31
## 124 96 6.9 91 9 1
## 125 78 5.1 92 9 2
## 126 73 2.8 93 9 3
## 127 91 4.6 93 9 4
## 128 47 7.4 87 9 5
## 129 32 15.5 84 9 6
## 130 20 10.9 80 9 7
## 131 23 10.3 78 9 8
## 132 21 10.9 75 9 9
## 133 24 9.7 73 9 10
## 134 44 14.9 81 9 11
## 135 21 15.5 76 9 12
## 136 28 6.3 77 9 13
## 137 9 10.9 71 9 14
## 138 13 11.5 71 9 15
## 139 46 6.9 78 9 16
## 140 18 13.8 67 9 17
## 141 13 10.3 76 9 18
## 142 24 10.3 68 9 19
## 143 16 8.0 82 9 20
## 144 13 12.6 64 9 21
## 145 23 9.2 71 9 22
## 146 36 10.3 81 9 23
## 147 7 10.3 69 9 24
## 148 14 16.6 63 9 25
## 149 30 6.9 70 9 26
## 150 NA 13.2 77 9 27
## 151 14 14.3 75 9 28
## 152 18 8.0 76 9 29
## 153 20 11.5 68 9 30
airquality[, c("Temp", "Wind")] # The Temp and Wind columns## Temp Wind
## 1 67 7.4
## 2 72 8.0
## 3 74 12.6
## 4 62 11.5
## 5 56 14.3
## 6 66 14.9
## 7 65 8.6
## 8 59 13.8
## 9 61 20.1
## 10 69 8.6
## 11 74 6.9
## 12 69 9.7
## 13 66 9.2
## 14 68 10.9
## 15 58 13.2
## 16 64 11.5
## 17 66 12.0
## 18 57 18.4
## 19 68 11.5
## 20 62 9.7
## 21 59 9.7
## 22 73 16.6
## 23 61 9.7
## 24 61 12.0
## 25 57 16.6
## 26 58 14.9
## 27 57 8.0
## 28 67 12.0
## 29 81 14.9
## 30 79 5.7
## 31 76 7.4
## 32 78 8.6
## 33 74 9.7
## 34 67 16.1
## 35 84 9.2
## 36 85 8.6
## 37 79 14.3
## 38 82 9.7
## 39 87 6.9
## 40 90 13.8
## 41 87 11.5
## 42 93 10.9
## 43 92 9.2
## 44 82 8.0
## 45 80 13.8
## 46 79 11.5
## 47 77 14.9
## 48 72 20.7
## 49 65 9.2
## 50 73 11.5
## 51 76 10.3
## 52 77 6.3
## 53 76 1.7
## 54 76 4.6
## 55 76 6.3
## 56 75 8.0
## 57 78 8.0
## 58 73 10.3
## 59 80 11.5
## 60 77 14.9
## 61 83 8.0
## 62 84 4.1
## 63 85 9.2
## 64 81 9.2
## 65 84 10.9
## 66 83 4.6
## 67 83 10.9
## 68 88 5.1
## 69 92 6.3
## 70 92 5.7
## 71 89 7.4
## 72 82 8.6
## 73 73 14.3
## 74 81 14.9
## 75 91 14.9
## 76 80 14.3
## 77 81 6.9
## 78 82 10.3
## 79 84 6.3
## 80 87 5.1
## 81 85 11.5
## 82 74 6.9
## 83 81 9.7
## 84 82 11.5
## 85 86 8.6
## 86 85 8.0
## 87 82 8.6
## 88 86 12.0
## 89 88 7.4
## 90 86 7.4
## 91 83 7.4
## 92 81 9.2
## 93 81 6.9
## 94 81 13.8
## 95 82 7.4
## 96 86 6.9
## 97 85 7.4
## 98 87 4.6
## 99 89 4.0
## 100 90 10.3
## 101 90 8.0
## 102 92 8.6
## 103 86 11.5
## 104 86 11.5
## 105 82 11.5
## 106 80 9.7
## 107 79 11.5
## 108 77 10.3
## 109 79 6.3
## 110 76 7.4
## 111 78 10.9
## 112 78 10.3
## 113 77 15.5
## 114 72 14.3
## 115 75 12.6
## 116 79 9.7
## 117 81 3.4
## 118 86 8.0
## 119 88 5.7
## 120 97 9.7
## 121 94 2.3
## 122 96 6.3
## 123 94 6.3
## 124 91 6.9
## 125 92 5.1
## 126 93 2.8
## 127 93 4.6
## 128 87 7.4
## 129 84 15.5
## 130 80 10.9
## 131 78 10.3
## 132 75 10.9
## 133 73 9.7
## 134 81 14.9
## 135 76 15.5
## 136 77 6.3
## 137 71 10.9
## 138 71 11.5
## 139 78 6.9
## 140 67 13.8
## 141 76 10.3
## 142 68 10.3
## 143 82 8.0
## 144 64 12.6
## 145 71 9.2
## 146 81 10.3
## 147 69 10.3
## 148 63 16.6
## 149 70 6.9
## 150 77 13.2
## 151 75 14.3
## 152 76 8.0
## 153 68 11.5
# Build a small example data frame of customer ages and purchase amounts.
age <- c(28, 48, 47, 71, 22, 80, 48, 30, 31)
purchase <- c(20, 59, 2, 12, 22, 160, 34, 34, 29)
bookstore <- data.frame(age, purchase)
# Correct a single cell: set the second customer's age to 18, by name...
bookstore$age[2] <- 18
# or
# ...equivalently by row/column position (row 2, column 1 = age).
bookstore[2, 1] <- 18
# subsetting with logical values
# Use `<-` (not `=`) for assignment, with spaces after commas.
nums <- c(12, 9, 8, 14, 7, 16, 3, 2, 9)
nums > 10## [1] TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
nums[nums > 10]## [1] 12 14 16
which(nums > 10)## [1] 1 4 6
# equal to
seq(along = nums)[nums > 10]## [1] 1 4 6
which.max(airquality$Temp)## [1] 120
airquality[which.max(airquality$Temp),]## Ozone Solar.R Wind Temp Month Day
## 120 76 203 9.7 97 8 28
airquality[airquality$Temp > 90, ]## Ozone Solar.R Wind Temp Month Day
## 42 NA 259 10.9 93 6 11
## 43 NA 250 9.2 92 6 12
## 69 97 267 6.3 92 7 8
## 70 97 272 5.7 92 7 9
## 75 NA 291 14.9 91 7 14
## 102 NA 222 8.6 92 8 10
## 120 76 203 9.7 97 8 28
## 121 118 225 2.3 94 8 29
## 122 84 237 6.3 96 8 30
## 123 85 188 6.3 94 8 31
## 124 96 167 6.9 91 9 1
## 125 78 197 5.1 92 9 2
## 126 73 183 2.8 93 9 3
## 127 91 189 4.6 93 9 4
# knowing if all elements in a vector fulfill the condition
all(airquality$Temp > 90)## [1] FALSE
# knowing whether at least one element in a vector fulfill the condition
any(airquality$Temp > 90)## [1] TRUE
# finding how many elements that fulfill a condition
sum(airquality$Temp > 90)## [1] 14
# modifying elements through logical subscriptions
nums[nums > 10] <- 0
nums## [1] 0 9 8 0 7 0 3 2 9
dd <- data.frame(a = c(5, 9, 12, 15, 17, 11), b = c(8, NA, 12, 10, NA, 15))
dd[dd$b > 10, ]## a b
## NA NA NA
## 3 12 12
## NA.1 NA NA
## 6 11 15
bookstore$visit_length <- c(5, 2, 20, 22, 12, 31, 9, 10, 11)
bookstore## age purchase visit_length
## 1 28 20 5
## 2 18 59 2
## 3 47 2 20
## 4 71 12 22
## 5 22 22 12
## 6 80 160 31
## 7 48 34 9
## 8 30 34 10
## 9 31 29 11
# storing the TRUE or FALSE values in a new variable
# Adds a logical column flagging rows where Temp exceeds 90.
airquality$Hot <- airquality$Temp > 90
airquality$Hot## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# filtering and creating a new variable
# Extract Temp readings for May (Month == 5) and June (Month == 6) only.
temp_may <- airquality$Temp[airquality$Month == 5]
temp_june <- airquality$Temp[airquality$Month == 6]
# splitting vectors into lists
# split() returns one list element per unique value of Month (5 through 9).
temps <- split(airquality$Temp, airquality$Month)
# Replace the numeric names ("5".."9") with month names, in the same order.
names(temps) <- c("May", "June", "July", "August", "September")
temps## $May
## [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76
##
## $June
## [1] 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73 76 77 76 76 76 75
## [26] 78 73 80 77 83
##
## $July
## [1] 84 85 81 84 83 83 88 92 92 89 82 73 81 91 80 81 82 84 87 85 74 81 82 86 85
## [26] 82 86 88 86 83 81
##
## $August
## [1] 81 81 82 86 85 87 89 90 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81
## [26] 86 88 97 94 96 94
##
## $September
## [1] 91 92 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63
## [26] 70 77 75 76 68
temps$June## [1] 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73 76 77 76 76 76 75
## [26] 78 73 80 77 83
# collapsing lists into vectors
unlist(temps)## May1 May2 May3 May4 May5 May6
## 67 72 74 62 56 66
## May7 May8 May9 May10 May11 May12
## 65 59 61 69 74 69
## May13 May14 May15 May16 May17 May18
## 66 68 58 64 66 57
## May19 May20 May21 May22 May23 May24
## 68 62 59 73 61 61
## May25 May26 May27 May28 May29 May30
## 57 58 57 67 81 79
## May31 June1 June2 June3 June4 June5
## 76 78 74 67 84 85
## June6 June7 June8 June9 June10 June11
## 79 82 87 90 87 93
## June12 June13 June14 June15 June16 June17
## 92 82 80 79 77 72
## June18 June19 June20 June21 June22 June23
## 65 73 76 77 76 76
## June24 June25 June26 June27 June28 June29
## 76 75 78 73 80 77
## June30 July1 July2 July3 July4 July5
## 83 84 85 81 84 83
## July6 July7 July8 July9 July10 July11
## 83 88 92 92 89 82
## July12 July13 July14 July15 July16 July17
## 73 81 91 80 81 82
## July18 July19 July20 July21 July22 July23
## 84 87 85 74 81 82
## July24 July25 July26 July27 July28 July29
## 86 85 82 86 88 86
## July30 July31 August1 August2 August3 August4
## 83 81 81 81 82 86
## August5 August6 August7 August8 August9 August10
## 85 87 89 90 90 92
## August11 August12 August13 August14 August15 August16
## 86 86 82 80 79 77
## August17 August18 August19 August20 August21 August22
## 79 76 78 78 77 72
## August23 August24 August25 August26 August27 August28
## 75 79 81 86 88 97
## August29 August30 August31 September1 September2 September3
## 94 96 94 91 92 93
## September4 September5 September6 September7 September8 September9
## 93 87 84 80 78 75
## September10 September11 September12 September13 September14 September15
## 73 81 76 77 71 71
## September16 September17 September18 September19 September20 September21
## 78 67 76 68 82 64
## September22 September23 September24 September25 September26 September27
## 71 81 69 63 70 77
## September28 September29 September30
## 75 76 68
# using subset
subset(dd, b > 10)## a b
## 3 12 12
## 6 11 15
some <- subset(LifeCycleSavings, sr > 10, select = c(pop15, pop75))
head(some)## pop15 pop75
## Australia 29.35 2.87
## Austria 23.32 4.41
## Belgium 23.80 4.43
## Brazil 42.19 0.83
## China 44.75 0.67
## Costa Rica 47.64 1.14
# Keep a contiguous range of columns, addressed by name...
life1 <- subset(LifeCycleSavings, select = pop15:dpi)
# or
# ...or by position (columns 1 through 3).
life1 <- subset(LifeCycleSavings, select = 1:3)
head(life1)## sr pop15 pop75
## Australia 11.43 29.35 2.87
## Austria 12.07 23.32 4.41
## Belgium 13.17 23.80 4.43
## Bolivia 5.75 41.89 1.67
## Brazil 12.88 42.19 0.83
## Canada 8.79 31.72 2.85
# Drop columns by negative selection, addressed by name...
life2 <- subset(LifeCycleSavings, select = c(-pop15, -pop75))
# or
# ...or by position (drop columns 2 and 3).
life2 <- subset(LifeCycleSavings, select = -c(2, 3))
head(life2)## sr dpi ddpi
## Australia 11.43 2329.68 2.87
## Austria 12.07 1507.99 3.93
## Belgium 13.17 2108.47 3.82
## Bolivia 5.75 189.13 0.22
## Brazil 12.88 728.47 4.56
## Canada 8.79 2982.88 2.43
# sorting with sort()
# Example vector with a duplicate value (7) to show stable handling of ties.
w <- c(5, 4, 7, 2, 7, 1)
sort(w)## [1] 1 2 4 5 7 7
sort(w, decreasing = TRUE)## [1] 7 7 5 4 2 1
length(w) <- 7
sort(w, na.last = TRUE)## [1] 1 2 4 5 7 7 NA
sort(w, na.last = FALSE)## [1] NA 1 2 4 5 7 7
# using order() to sort a data frame
# order() returns the permutation of indices that would sort its input.
v <- c(11, 12, 13, 15, 14)
order(v)## [1] 1 2 3 5 4
v[order(v)]## [1] 11 12 13 14 15
u <- c("pig", "cow", "duck", "horse", "rat")
# Two-column data frame; its rows are reordered by column v via order() below.
w <- data.frame(v, u)
w## v u
## 1 11 pig
## 2 12 cow
## 3 13 duck
## 4 15 horse
## 5 14 rat
w[order(w$v), ] ## v u
## 1 11 pig
## 2 12 cow
## 3 13 duck
## 5 14 rat
## 4 15 horse
snowdata[order(snowdata$SnowInches), ]## Winter SnowInches SnowMeters
## 72 2011-2012 9.3 0.23622
## 33 1972-1973 10.3 0.26162
## 40 1979-1980 12.5 0.31750
## 55 1994-1995 14.9 0.37846
## 62 2001-2002 15.1 0.38354
## 49 1988-1989 15.5 0.39370
## 67 2006-2007 17.1 0.43434
## 46 1985-1986 18.1 0.45974
## 51 1990-1991 19.1 0.48514
## 7 1946-1947 19.4 0.49276
## 52 1991-1992 22.0 0.55880
## 41 1980-1981 22.3 0.56642
## 14 1953-1954 23.6 0.59944
## 2 1941-1942 23.9 0.60706
## 60 1999-2000 24.4 0.61976
## 15 1954-1955 25.1 0.63754
## 58 1997-1998 25.6 0.65024
## 45 1984-1985 26.6 0.67564
## 39 1978-1979 27.5 0.69850
## 35 1974-1975 27.6 0.70104
## 4 1943-1944 27.7 0.70358
## 11 1950-1951 29.7 0.75438
## 13 1952-1953 29.8 0.75692
## 23 1962-1963 30.9 0.78486
## 12 1951-1952 31.9 0.81026
## 10 1949-1950 32.0 0.81280
## 43 1982-1983 32.7 0.83058
## 19 1958-1959 34.1 0.86614
## 70 2009-2010 35.7 0.90678
## 76 2015-2016 36.2 0.91948
## 59 1998-1999 36.4 0.92456
## 34 1973-1974 36.9 0.93726
## 9 1948-1949 37.1 0.94234
## 50 1989-1990 39.2 0.99568
## 64 2003-2004 39.4 1.00076
## 66 2005-2006 39.9 1.01346
## 20 1959-1960 40.9 1.03886
## 47 1986-1987 42.5 1.07950
## 44 1983-1984 43.0 1.09220
## 26 1965-1966 44.1 1.12014
## 18 1957-1958 44.7 1.13538
## 22 1961-1962 44.7 1.13538
## 28 1967-1968 44.8 1.13792
## 3 1942-1943 45.7 1.16078
## 61 2000-2001 45.9 1.16586
## 36 1975-1976 46.6 1.18364
## 32 1971-1972 47.5 1.20650
## 1 1940-1941 47.8 1.21412
## 30 1969-1970 48.8 1.23952
## 25 1964-1965 50.4 1.28016
## 6 1945-1946 50.8 1.29032
## 68 2007-2008 51.2 1.30048
## 57 1996-1997 51.9 1.31826
## 17 1956-1957 52.0 1.32080
## 48 1987-1988 52.6 1.33604
## 29 1968-1969 53.8 1.36652
## 31 1970-1971 57.3 1.45542
## 37 1976-1977 58.5 1.48590
## 74 2013-2014 58.9 1.49606
## 5 1944-1945 59.2 1.50368
## 27 1966-1967 60.1 1.52654
## 16 1955-1956 60.9 1.54686
## 21 1960-1961 61.5 1.56210
## 42 1981-1982 61.8 1.56972
## 24 1963-1964 63.0 1.60020
## 73 2012-2013 63.4 1.61036
## 69 2008-2009 65.9 1.67386
## 63 2002-2003 70.9 1.80086
## 71 2010-2011 81.0 2.05740
## 53 1992-1993 83.9 2.13106
## 38 1977-1978 85.1 2.16154
## 65 2004-2005 86.6 2.19964
## 8 1947-1948 89.2 2.26568
## 54 1993-1994 96.3 2.44602
## 56 1995-1996 107.6 2.73304
## 75 2014-2015 110.6 2.80924
snowdata[order(snowdata$SnowInches, snowdata$SnowMeters), ]## Winter SnowInches SnowMeters
## 72 2011-2012 9.3 0.23622
## 33 1972-1973 10.3 0.26162
## 40 1979-1980 12.5 0.31750
## 55 1994-1995 14.9 0.37846
## 62 2001-2002 15.1 0.38354
## 49 1988-1989 15.5 0.39370
## 67 2006-2007 17.1 0.43434
## 46 1985-1986 18.1 0.45974
## 51 1990-1991 19.1 0.48514
## 7 1946-1947 19.4 0.49276
## 52 1991-1992 22.0 0.55880
## 41 1980-1981 22.3 0.56642
## 14 1953-1954 23.6 0.59944
## 2 1941-1942 23.9 0.60706
## 60 1999-2000 24.4 0.61976
## 15 1954-1955 25.1 0.63754
## 58 1997-1998 25.6 0.65024
## 45 1984-1985 26.6 0.67564
## 39 1978-1979 27.5 0.69850
## 35 1974-1975 27.6 0.70104
## 4 1943-1944 27.7 0.70358
## 11 1950-1951 29.7 0.75438
## 13 1952-1953 29.8 0.75692
## 23 1962-1963 30.9 0.78486
## 12 1951-1952 31.9 0.81026
## 10 1949-1950 32.0 0.81280
## 43 1982-1983 32.7 0.83058
## 19 1958-1959 34.1 0.86614
## 70 2009-2010 35.7 0.90678
## 76 2015-2016 36.2 0.91948
## 59 1998-1999 36.4 0.92456
## 34 1973-1974 36.9 0.93726
## 9 1948-1949 37.1 0.94234
## 50 1989-1990 39.2 0.99568
## 64 2003-2004 39.4 1.00076
## 66 2005-2006 39.9 1.01346
## 20 1959-1960 40.9 1.03886
## 47 1986-1987 42.5 1.07950
## 44 1983-1984 43.0 1.09220
## 26 1965-1966 44.1 1.12014
## 18 1957-1958 44.7 1.13538
## 22 1961-1962 44.7 1.13538
## 28 1967-1968 44.8 1.13792
## 3 1942-1943 45.7 1.16078
## 61 2000-2001 45.9 1.16586
## 36 1975-1976 46.6 1.18364
## 32 1971-1972 47.5 1.20650
## 1 1940-1941 47.8 1.21412
## 30 1969-1970 48.8 1.23952
## 25 1964-1965 50.4 1.28016
## 6 1945-1946 50.8 1.29032
## 68 2007-2008 51.2 1.30048
## 57 1996-1997 51.9 1.31826
## 17 1956-1957 52.0 1.32080
## 48 1987-1988 52.6 1.33604
## 29 1968-1969 53.8 1.36652
## 31 1970-1971 57.3 1.45542
## 37 1976-1977 58.5 1.48590
## 74 2013-2014 58.9 1.49606
## 5 1944-1945 59.2 1.50368
## 27 1966-1967 60.1 1.52654
## 16 1955-1956 60.9 1.54686
## 21 1960-1961 61.5 1.56210
## 42 1981-1982 61.8 1.56972
## 24 1963-1964 63.0 1.60020
## 73 2012-2013 63.4 1.61036
## 69 2008-2009 65.9 1.67386
## 63 2002-2003 70.9 1.80086
## 71 2010-2011 81.0 2.05740
## 53 1992-1993 83.9 2.13106
## 38 1977-1978 85.1 2.16154
## 65 2004-2005 86.6 2.19964
## 8 1947-1948 89.2 2.26568
## 54 1993-1994 96.3 2.44602
## 56 1995-1996 107.6 2.73304
## 75 2014-2015 110.6 2.80924
# getting the location of the maximum and minimum values
which.max(snowdata$Boston)## integer(0)
which.min(snowdata$Boston)## integer(0)
# using indexes on the rows and/or columns
# Logical row mask: female patients older than 68 (elementwise &, not &&).
rows <- Arthritis$Sex == "Female" & Arthritis$Age > 68
# Character vector of the column names to keep.
cols <- c("Treatment", "Improved")
Arthritis[rows, cols]## Treatment Improved
## 39 Treated None
## 40 Treated Some
## 41 Treated Some
## 84 Placebo Marked
# using names of values of a column to subset and create a new variable (no data)
# btw9s[c("BB", "BE", "MV","SN","ST","TH"), "EW"] <- "East"
# creating new variables based on others and reordering (no data)
# Fleiss93 <- within(Fleiss93, {
# total <- n.e + n.c # create new var based on the sum of two
# st1 <- as.character(study) # change type
# st <- reorder(study, -(total)) # reorder var
# })
# fixing the naming (no data)
# exp1_long$condition <- ifelse(exp1_long$condition == "no", "No_communication",
# ifelse(exp1_long$condition == "go", "High_confidence",
# ifelse(exp1_long$condition == "me", "Medium_confidence",
# ifelse(exp1_long$condition == "ba", "Low_confidence",
# exp1_long$condition
# )
# )
# )
# )
# renaming variables and convert to numeric (no data)
# exp1_long$temperature <- as.numeric(ifelse(exp1_long$temperature == "315", "31.5",
# ifelse(exp1_long$temperature == "325", "32.5",
# ifelse(exp1_long$temperature == "335", "33.5",
# ifelse(exp1_long$temperature == "345", "34.5",
# ifelse(exp1_long$temperature == "355", "35.5",
# ifelse(exp1_long$temperature == "365", "36.5",
# exp1_long$temperature
# )
# )
# )
# )
# )
# ))
# recoding Yes/No responses as numeric (Yes=1, No=0) (no data)
# exp1_long$response_code <- ifelse(exp1_long$response == "Yes", 1, 0)
# adding values by row and getting rid of all NAs (no data)
# df.a$expertise_sum <- rowSums(df.a[, 12:19], na.rm = TRUE)
# cutting numeric variables into categories
age <- c(60, 58, 24, 26, 34, 42, 31, 30, 33, 2, 9)
# Break points 0, 20, 40, 60 define three intervals: (0,20], (20,40], (40,60].
age.breaks <- seq(from = 0, to = 60, by = 20)
# One label per interval, in the same order as the intervals.
age.labels <- c("young", "adult", "older")
# cut() assigns each age to its interval; intervals are right-closed by default,
# so 60 falls in (40,60] ("older") and 2 falls in (0,20] ("young").
age.group <- cut(x = age, breaks = age.breaks, labels = age.labels)
age.df <- data.frame(age, age.group)
age.df## age age.group
## 1 60 older
## 2 58 older
## 3 24 adult
## 4 26 adult
## 5 34 adult
## 6 42 older
## 7 31 adult
## 8 30 adult
## 9 33 adult
## 10 2 young
## 11 9 young
# creating a temporal seq of years and calculate mean (no data)
# yrs <- c(seq(1972, 1988, 4), 1993, seq(1996, 2016, 4)) # seq every 4 years
# Calculating mean for every year
# mean_age <- gss_lon %>%
# filter(age %nin% NA && year %in% yrs) %>%
# group_by(year) %>%
# summarize(xbar = round(mean(age, na.rm = TRUE), 0))
# splitting a vector by group (no data)
# using split function: x = the variable that needs to be split into groups; y = the grouping variable
# speech.by.char <- split(x = utterance, y = speaker)
# speech.by.char
# pull variables from a list or data frame out and into the workspace
# importList(speech.by.char)
# locating NAs in a data frame
# pos_country <- which(is.na(df$Region))
# replacing NAs in a data frame for the last value with na.locf() (no data)
# replacing NAs for the last value
# df <- mutate(df, Region = zoo::na.locf(Region))filter() picks rows based on data values, but
select() chooses columns based on the column names (not
values of data within the columns).
Select certain rows based on a logical condition - dplyr's
filter(). To check for one condition OR another condition,
use the | symbol, which means or. You can get the number of
rows in a data frame with nrow(). To filter by one
condition AND a second condition, you can use the &
sign. Select certain rows based on row number - dplyr's
slice().
Use arrange(dataframe, colname) to sort in ascending
order and arrange(dataframe, desc(colname)) to sort in
descending order. To sort by a second column in case there are ties in
the first column, the syntax is
arrange(dataframe, col1, col2).
You can select by specific column name, no quotes or c()
needed: select(snowdata, Winter, Boston). Select a
contiguous group of columns, such as starting with Boston and ending
with New York City, with the syntax
select(snowdata, Boston:NYC). You can select based on
column names containing certain characters; for example, if you had a
data frame with column names in the format city_state such as Boston_MA,
Chicago_IL, NYC_NY, Fargo_ND and Syracuse_NY, you could select all the
New York State entries using
select(dataframe, contains("_NY")) or
select(dataframe, ends_with("_NY")). You can delete columns
by putting a minus sign before your selection, such as
select(snowdata, -(Boston:Chicago)) or
select(dataframe, -contains("_NY")).
select_if() lets you use is. functions such as
is.numeric() or is.character() to choose
columns by data type.
The tidyselect functions give you many options to
select columns in R. The first function is everything. As
the name suggests, it lets you select all columns of a data frame. Then
we have last_col. With this function you can select the
last column in a data frame. We have the two functions
starts_with and ends_with. You use these
functions when you want to select columns that start or end with exactly
a certain string. starts_with and ends_with
works with any character, but also with a vector of characters.
Next we have the contains function.
contains searches for columns that contain a specific
string. Note that it does not work with regular expressions, but
searches for exactly the string you specify. By default, however, the
function is not case-sensitive. It doesn”t matter if your columns are in
uppercase or lowercase. If you are concerned about case sensitivity, set
the ignore.case argument to FALSE (this also
works with starts_with, ends_with, and
matches). Unlike contains, matches works with
regular expressions.
The function num_range is useful if your column names
follow a certain pattern.
Finally, there is the where function. where
is used when you want to select variables of a certain data type. Other
predicate functions are:
is.double, is.logical, is.factor, and is.integer. You can combine the different selection functions with the
& and | operators.
# Load the seasonal snowfall data (Winter, Boston, Chicago, NYC) via rio.
snowdata <- rio::import("input/BostonChicagoNYCSnowfalls.csv")
# knowing how many rows (nrow() gives the row count; used to index the last row)
snowdata[nrow(snowdata), ]## Winter Boston Chicago NYC
## 76 2015-2016 36.2 31.2 32.1
# using filter from dplyr
filter(snowdata, Boston > 100)## Winter Boston Chicago NYC
## 1 1995-1996 107.6 23.9 75.6
## 2 2014-2015 110.6 50.7 50.3
# OR
filter(snowdata, Boston < 20 | Boston > 100)## Winter Boston Chicago NYC
## 1 1946-1947 19.4 34.1 30.6
## 2 1972-1973 10.3 32.9 2.8
## 3 1979-1980 12.5 42.4 12.8
## 4 1985-1986 18.1 29.0 13.0
## 5 1988-1989 15.5 24.5 8.1
## 6 1990-1991 19.1 36.7 24.9
## 7 1994-1995 14.9 24.1 11.8
## 8 1995-1996 107.6 23.9 75.6
## 9 2001-2002 15.1 31.1 3.5
## 10 2006-2007 17.1 35.6 12.4
## 11 2011-2012 9.3 19.8 7.4
## 12 2014-2015 110.6 50.7 50.3
# AND
filter(snowdata, Boston > 40 & Boston < 50)## Winter Boston Chicago NYC
## 1 1940-1941 47.8 52.5 39.0
## 2 1942-1943 45.7 45.2 29.5
## 3 1957-1958 44.7 20.0 44.7
## 4 1959-1960 40.9 50.9 39.2
## 5 1961-1962 44.7 58.9 18.1
## 6 1965-1966 44.1 24.9 21.4
## 7 1967-1968 44.8 28.4 19.5
## 8 1969-1970 48.8 77.0 25.6
## 9 1971-1972 47.5 46.8 22.9
## 10 1975-1976 46.6 43.3 17.3
## 11 1983-1984 43.0 49.0 25.4
## 12 1986-1987 42.5 26.2 23.1
## 13 2000-2001 45.9 39.2 35.0
filter(snowdata, Boston > 50, Boston < 80)## Winter Boston Chicago NYC
## 1 1944-1945 59.2 34.9 27.1
## 2 1945-1946 50.8 23.9 31.4
## 3 1955-1956 60.9 26.3 33.5
## 4 1956-1957 52.0 31.3 21.9
## 5 1960-1961 61.5 40.7 54.7
## 6 1963-1964 63.0 35.2 44.7
## 7 1964-1965 50.4 59.5 24.4
## 8 1966-1967 60.1 68.4 51.5
## 9 1968-1969 53.8 29.4 30.2
## 10 1970-1971 57.3 37.9 15.5
## 11 1976-1977 58.5 54.1 24.5
## 12 1981-1982 61.8 59.3 24.6
## 13 1987-1988 52.6 42.6 19.1
## 14 1996-1997 51.9 40.6 10.0
## 15 2002-2003 70.9 28.6 49.3
## 16 2007-2008 51.2 60.3 11.9
## 17 2008-2009 65.9 52.7 27.6
## 18 2012-2013 63.4 30.1 26.1
## 19 2013-2014 58.9 82.0 57.4
# using slice(): select rows by position (here rows 60 through 76)
myresults <- slice(snowdata, 60:76)
# arranging data frames
# ascending order
arrange(snowdata, Boston)## Winter Boston Chicago NYC
## 1 2011-2012 9.3 19.8 7.4
## 2 1972-1973 10.3 32.9 2.8
## 3 1979-1980 12.5 42.4 12.8
## 4 1994-1995 14.9 24.1 11.8
## 5 2001-2002 15.1 31.1 3.5
## 6 1988-1989 15.5 24.5 8.1
## 7 2006-2007 17.1 35.6 12.4
## 8 1985-1986 18.1 29.0 13.0
## 9 1990-1991 19.1 36.7 24.9
## 10 1946-1947 19.4 34.1 30.6
## 11 1991-1992 22.0 28.4 12.6
## 12 1980-1981 22.3 35.0 19.4
## 13 1953-1954 23.6 43.2 15.8
## 14 1941-1942 23.9 29.8 11.3
## 15 1999-2000 24.4 30.3 16.3
## 16 1954-1955 25.1 32.2 11.5
## 17 1997-1998 25.6 29.6 5.5
## 18 1984-1985 26.6 39.1 24.1
## 19 1978-1979 27.5 89.7 29.4
## 20 1974-1975 27.6 52.2 13.1
## 21 1943-1944 27.7 24.0 23.8
## 22 1950-1951 29.7 54.4 11.6
## 23 1952-1953 29.8 23.4 15.1
## 24 1962-1963 30.9 42.7 16.3
## 25 1951-1952 31.9 66.4 19.7
## 26 1949-1950 32.0 33.8 13.8
## 27 1982-1983 32.7 26.6 27.2
## 28 1958-1959 34.1 41.0 13.0
## 29 2009-2010 35.7 54.2 51.4
## 30 2015-2016 36.2 31.2 32.1
## 31 1998-1999 36.4 50.9 12.7
## 32 1973-1974 36.9 58.3 23.5
## 33 1948-1949 37.1 14.3 46.6
## 34 1989-1990 39.2 33.8 13.4
## 35 2003-2004 39.4 24.8 42.6
## 36 2005-2006 39.9 26.0 40.0
## 37 1959-1960 40.9 50.9 39.2
## 38 1986-1987 42.5 26.2 23.1
## 39 1983-1984 43.0 49.0 25.4
## 40 1965-1966 44.1 24.9 21.4
## 41 1957-1958 44.7 20.0 44.7
## 42 1961-1962 44.7 58.9 18.1
## 43 1967-1968 44.8 28.4 19.5
## 44 1942-1943 45.7 45.2 29.5
## 45 2000-2001 45.9 39.2 35.0
## 46 1975-1976 46.6 43.3 17.3
## 47 1971-1972 47.5 46.8 22.9
## 48 1940-1941 47.8 52.5 39.0
## 49 1969-1970 48.8 77.0 25.6
## 50 1964-1965 50.4 59.5 24.4
## 51 1945-1946 50.8 23.9 31.4
## 52 2007-2008 51.2 60.3 11.9
## 53 1996-1997 51.9 40.6 10.0
## 54 1956-1957 52.0 31.3 21.9
## 55 1987-1988 52.6 42.6 19.1
## 56 1968-1969 53.8 29.4 30.2
## 57 1970-1971 57.3 37.9 15.5
## 58 1976-1977 58.5 54.1 24.5
## 59 2013-2014 58.9 82.0 57.4
## 60 1944-1945 59.2 34.9 27.1
## 61 1966-1967 60.1 68.4 51.5
## 62 1955-1956 60.9 26.3 33.5
## 63 1960-1961 61.5 40.7 54.7
## 64 1981-1982 61.8 59.3 24.6
## 65 1963-1964 63.0 35.2 44.7
## 66 2012-2013 63.4 30.1 26.1
## 67 2008-2009 65.9 52.7 27.6
## 68 2002-2003 70.9 28.6 49.3
## 69 2010-2011 81.0 57.9 61.9
## 70 1992-1993 83.9 46.9 24.5
## 71 1977-1978 85.1 82.3 50.7
## 72 2004-2005 86.6 39.4 41.0
## 73 1947-1948 89.2 38.1 63.2
## 74 1993-1994 96.3 41.8 53.4
## 75 1995-1996 107.6 23.9 75.6
## 76 2014-2015 110.6 50.7 50.3
# descending order
arrange(snowdata, desc(Boston))## Winter Boston Chicago NYC
## 1 2014-2015 110.6 50.7 50.3
## 2 1995-1996 107.6 23.9 75.6
## 3 1993-1994 96.3 41.8 53.4
## 4 1947-1948 89.2 38.1 63.2
## 5 2004-2005 86.6 39.4 41.0
## 6 1977-1978 85.1 82.3 50.7
## 7 1992-1993 83.9 46.9 24.5
## 8 2010-2011 81.0 57.9 61.9
## 9 2002-2003 70.9 28.6 49.3
## 10 2008-2009 65.9 52.7 27.6
## 11 2012-2013 63.4 30.1 26.1
## 12 1963-1964 63.0 35.2 44.7
## 13 1981-1982 61.8 59.3 24.6
## 14 1960-1961 61.5 40.7 54.7
## 15 1955-1956 60.9 26.3 33.5
## 16 1966-1967 60.1 68.4 51.5
## 17 1944-1945 59.2 34.9 27.1
## 18 2013-2014 58.9 82.0 57.4
## 19 1976-1977 58.5 54.1 24.5
## 20 1970-1971 57.3 37.9 15.5
## 21 1968-1969 53.8 29.4 30.2
## 22 1987-1988 52.6 42.6 19.1
## 23 1956-1957 52.0 31.3 21.9
## 24 1996-1997 51.9 40.6 10.0
## 25 2007-2008 51.2 60.3 11.9
## 26 1945-1946 50.8 23.9 31.4
## 27 1964-1965 50.4 59.5 24.4
## 28 1969-1970 48.8 77.0 25.6
## 29 1940-1941 47.8 52.5 39.0
## 30 1971-1972 47.5 46.8 22.9
## 31 1975-1976 46.6 43.3 17.3
## 32 2000-2001 45.9 39.2 35.0
## 33 1942-1943 45.7 45.2 29.5
## 34 1967-1968 44.8 28.4 19.5
## 35 1957-1958 44.7 20.0 44.7
## 36 1961-1962 44.7 58.9 18.1
## 37 1965-1966 44.1 24.9 21.4
## 38 1983-1984 43.0 49.0 25.4
## 39 1986-1987 42.5 26.2 23.1
## 40 1959-1960 40.9 50.9 39.2
## 41 2005-2006 39.9 26.0 40.0
## 42 2003-2004 39.4 24.8 42.6
## 43 1989-1990 39.2 33.8 13.4
## 44 1948-1949 37.1 14.3 46.6
## 45 1973-1974 36.9 58.3 23.5
## 46 1998-1999 36.4 50.9 12.7
## 47 2015-2016 36.2 31.2 32.1
## 48 2009-2010 35.7 54.2 51.4
## 49 1958-1959 34.1 41.0 13.0
## 50 1982-1983 32.7 26.6 27.2
## 51 1949-1950 32.0 33.8 13.8
## 52 1951-1952 31.9 66.4 19.7
## 53 1962-1963 30.9 42.7 16.3
## 54 1952-1953 29.8 23.4 15.1
## 55 1950-1951 29.7 54.4 11.6
## 56 1943-1944 27.7 24.0 23.8
## 57 1974-1975 27.6 52.2 13.1
## 58 1978-1979 27.5 89.7 29.4
## 59 1984-1985 26.6 39.1 24.1
## 60 1997-1998 25.6 29.6 5.5
## 61 1954-1955 25.1 32.2 11.5
## 62 1999-2000 24.4 30.3 16.3
## 63 1941-1942 23.9 29.8 11.3
## 64 1953-1954 23.6 43.2 15.8
## 65 1980-1981 22.3 35.0 19.4
## 66 1991-1992 22.0 28.4 12.6
## 67 1946-1947 19.4 34.1 30.6
## 68 1990-1991 19.1 36.7 24.9
## 69 1985-1986 18.1 29.0 13.0
## 70 2006-2007 17.1 35.6 12.4
## 71 1988-1989 15.5 24.5 8.1
## 72 2001-2002 15.1 31.1 3.5
## 73 1994-1995 14.9 24.1 11.8
## 74 1979-1980 12.5 42.4 12.8
## 75 1972-1973 10.3 32.9 2.8
## 76 2011-2012 9.3 19.8 7.4
# by a second column
arrange(snowdata, Boston, NYC)## Winter Boston Chicago NYC
## 1 2011-2012 9.3 19.8 7.4
## 2 1972-1973 10.3 32.9 2.8
## 3 1979-1980 12.5 42.4 12.8
## 4 1994-1995 14.9 24.1 11.8
## 5 2001-2002 15.1 31.1 3.5
## 6 1988-1989 15.5 24.5 8.1
## 7 2006-2007 17.1 35.6 12.4
## 8 1985-1986 18.1 29.0 13.0
## 9 1990-1991 19.1 36.7 24.9
## 10 1946-1947 19.4 34.1 30.6
## 11 1991-1992 22.0 28.4 12.6
## 12 1980-1981 22.3 35.0 19.4
## 13 1953-1954 23.6 43.2 15.8
## 14 1941-1942 23.9 29.8 11.3
## 15 1999-2000 24.4 30.3 16.3
## 16 1954-1955 25.1 32.2 11.5
## 17 1997-1998 25.6 29.6 5.5
## 18 1984-1985 26.6 39.1 24.1
## 19 1978-1979 27.5 89.7 29.4
## 20 1974-1975 27.6 52.2 13.1
## 21 1943-1944 27.7 24.0 23.8
## 22 1950-1951 29.7 54.4 11.6
## 23 1952-1953 29.8 23.4 15.1
## 24 1962-1963 30.9 42.7 16.3
## 25 1951-1952 31.9 66.4 19.7
## 26 1949-1950 32.0 33.8 13.8
## 27 1982-1983 32.7 26.6 27.2
## 28 1958-1959 34.1 41.0 13.0
## 29 2009-2010 35.7 54.2 51.4
## 30 2015-2016 36.2 31.2 32.1
## 31 1998-1999 36.4 50.9 12.7
## 32 1973-1974 36.9 58.3 23.5
## 33 1948-1949 37.1 14.3 46.6
## 34 1989-1990 39.2 33.8 13.4
## 35 2003-2004 39.4 24.8 42.6
## 36 2005-2006 39.9 26.0 40.0
## 37 1959-1960 40.9 50.9 39.2
## 38 1986-1987 42.5 26.2 23.1
## 39 1983-1984 43.0 49.0 25.4
## 40 1965-1966 44.1 24.9 21.4
## 41 1961-1962 44.7 58.9 18.1
## 42 1957-1958 44.7 20.0 44.7
## 43 1967-1968 44.8 28.4 19.5
## 44 1942-1943 45.7 45.2 29.5
## 45 2000-2001 45.9 39.2 35.0
## 46 1975-1976 46.6 43.3 17.3
## 47 1971-1972 47.5 46.8 22.9
## 48 1940-1941 47.8 52.5 39.0
## 49 1969-1970 48.8 77.0 25.6
## 50 1964-1965 50.4 59.5 24.4
## 51 1945-1946 50.8 23.9 31.4
## 52 2007-2008 51.2 60.3 11.9
## 53 1996-1997 51.9 40.6 10.0
## 54 1956-1957 52.0 31.3 21.9
## 55 1987-1988 52.6 42.6 19.1
## 56 1968-1969 53.8 29.4 30.2
## 57 1970-1971 57.3 37.9 15.5
## 58 1976-1977 58.5 54.1 24.5
## 59 2013-2014 58.9 82.0 57.4
## 60 1944-1945 59.2 34.9 27.1
## 61 1966-1967 60.1 68.4 51.5
## 62 1955-1956 60.9 26.3 33.5
## 63 1960-1961 61.5 40.7 54.7
## 64 1981-1982 61.8 59.3 24.6
## 65 1963-1964 63.0 35.2 44.7
## 66 2012-2013 63.4 30.1 26.1
## 67 2008-2009 65.9 52.7 27.6
## 68 2002-2003 70.9 28.6 49.3
## 69 2010-2011 81.0 57.9 61.9
## 70 1992-1993 83.9 46.9 24.5
## 71 1977-1978 85.1 82.3 50.7
## 72 2004-2005 86.6 39.4 41.0
## 73 1947-1948 89.2 38.1 63.2
## 74 1993-1994 96.3 41.8 53.4
## 75 1995-1996 107.6 23.9 75.6
## 76 2014-2015 110.6 50.7 50.3
# selecting columns
select(snowdata, Winter, Boston)## Winter Boston
## 1 1940-1941 47.8
## 2 1941-1942 23.9
## 3 1942-1943 45.7
## 4 1943-1944 27.7
## 5 1944-1945 59.2
## 6 1945-1946 50.8
## 7 1946-1947 19.4
## 8 1947-1948 89.2
## 9 1948-1949 37.1
## 10 1949-1950 32.0
## 11 1950-1951 29.7
## 12 1951-1952 31.9
## 13 1952-1953 29.8
## 14 1953-1954 23.6
## 15 1954-1955 25.1
## 16 1955-1956 60.9
## 17 1956-1957 52.0
## 18 1957-1958 44.7
## 19 1958-1959 34.1
## 20 1959-1960 40.9
## 21 1960-1961 61.5
## 22 1961-1962 44.7
## 23 1962-1963 30.9
## 24 1963-1964 63.0
## 25 1964-1965 50.4
## 26 1965-1966 44.1
## 27 1966-1967 60.1
## 28 1967-1968 44.8
## 29 1968-1969 53.8
## 30 1969-1970 48.8
## 31 1970-1971 57.3
## 32 1971-1972 47.5
## 33 1972-1973 10.3
## 34 1973-1974 36.9
## 35 1974-1975 27.6
## 36 1975-1976 46.6
## 37 1976-1977 58.5
## 38 1977-1978 85.1
## 39 1978-1979 27.5
## 40 1979-1980 12.5
## 41 1980-1981 22.3
## 42 1981-1982 61.8
## 43 1982-1983 32.7
## 44 1983-1984 43.0
## 45 1984-1985 26.6
## 46 1985-1986 18.1
## 47 1986-1987 42.5
## 48 1987-1988 52.6
## 49 1988-1989 15.5
## 50 1989-1990 39.2
## 51 1990-1991 19.1
## 52 1991-1992 22.0
## 53 1992-1993 83.9
## 54 1993-1994 96.3
## 55 1994-1995 14.9
## 56 1995-1996 107.6
## 57 1996-1997 51.9
## 58 1997-1998 25.6
## 59 1998-1999 36.4
## 60 1999-2000 24.4
## 61 2000-2001 45.9
## 62 2001-2002 15.1
## 63 2002-2003 70.9
## 64 2003-2004 39.4
## 65 2004-2005 86.6
## 66 2005-2006 39.9
## 67 2006-2007 17.1
## 68 2007-2008 51.2
## 69 2008-2009 65.9
## 70 2009-2010 35.7
## 71 2010-2011 81.0
## 72 2011-2012 9.3
## 73 2012-2013 63.4
## 74 2013-2014 58.9
## 75 2014-2015 110.6
## 76 2015-2016 36.2
select(snowdata, Boston:NYC)## Boston Chicago NYC
## 1 47.8 52.5 39.0
## 2 23.9 29.8 11.3
## 3 45.7 45.2 29.5
## 4 27.7 24.0 23.8
## 5 59.2 34.9 27.1
## 6 50.8 23.9 31.4
## 7 19.4 34.1 30.6
## 8 89.2 38.1 63.2
## 9 37.1 14.3 46.6
## 10 32.0 33.8 13.8
## 11 29.7 54.4 11.6
## 12 31.9 66.4 19.7
## 13 29.8 23.4 15.1
## 14 23.6 43.2 15.8
## 15 25.1 32.2 11.5
## 16 60.9 26.3 33.5
## 17 52.0 31.3 21.9
## 18 44.7 20.0 44.7
## 19 34.1 41.0 13.0
## 20 40.9 50.9 39.2
## 21 61.5 40.7 54.7
## 22 44.7 58.9 18.1
## 23 30.9 42.7 16.3
## 24 63.0 35.2 44.7
## 25 50.4 59.5 24.4
## 26 44.1 24.9 21.4
## 27 60.1 68.4 51.5
## 28 44.8 28.4 19.5
## 29 53.8 29.4 30.2
## 30 48.8 77.0 25.6
## 31 57.3 37.9 15.5
## 32 47.5 46.8 22.9
## 33 10.3 32.9 2.8
## 34 36.9 58.3 23.5
## 35 27.6 52.2 13.1
## 36 46.6 43.3 17.3
## 37 58.5 54.1 24.5
## 38 85.1 82.3 50.7
## 39 27.5 89.7 29.4
## 40 12.5 42.4 12.8
## 41 22.3 35.0 19.4
## 42 61.8 59.3 24.6
## 43 32.7 26.6 27.2
## 44 43.0 49.0 25.4
## 45 26.6 39.1 24.1
## 46 18.1 29.0 13.0
## 47 42.5 26.2 23.1
## 48 52.6 42.6 19.1
## 49 15.5 24.5 8.1
## 50 39.2 33.8 13.4
## 51 19.1 36.7 24.9
## 52 22.0 28.4 12.6
## 53 83.9 46.9 24.5
## 54 96.3 41.8 53.4
## 55 14.9 24.1 11.8
## 56 107.6 23.9 75.6
## 57 51.9 40.6 10.0
## 58 25.6 29.6 5.5
## 59 36.4 50.9 12.7
## 60 24.4 30.3 16.3
## 61 45.9 39.2 35.0
## 62 15.1 31.1 3.5
## 63 70.9 28.6 49.3
## 64 39.4 24.8 42.6
## 65 86.6 39.4 41.0
## 66 39.9 26.0 40.0
## 67 17.1 35.6 12.4
## 68 51.2 60.3 11.9
## 69 65.9 52.7 27.6
## 70 35.7 54.2 51.4
## 71 81.0 57.9 61.9
## 72 9.3 19.8 7.4
## 73 63.4 30.1 26.1
## 74 58.9 82.0 57.4
## 75 110.6 50.7 50.3
## 76 36.2 31.2 32.1
select(snowdata, contains("C"))## Chicago NYC
## 1 52.5 39.0
## 2 29.8 11.3
## 3 45.2 29.5
## 4 24.0 23.8
## 5 34.9 27.1
## 6 23.9 31.4
## 7 34.1 30.6
## 8 38.1 63.2
## 9 14.3 46.6
## 10 33.8 13.8
## 11 54.4 11.6
## 12 66.4 19.7
## 13 23.4 15.1
## 14 43.2 15.8
## 15 32.2 11.5
## 16 26.3 33.5
## 17 31.3 21.9
## 18 20.0 44.7
## 19 41.0 13.0
## 20 50.9 39.2
## 21 40.7 54.7
## 22 58.9 18.1
## 23 42.7 16.3
## 24 35.2 44.7
## 25 59.5 24.4
## 26 24.9 21.4
## 27 68.4 51.5
## 28 28.4 19.5
## 29 29.4 30.2
## 30 77.0 25.6
## 31 37.9 15.5
## 32 46.8 22.9
## 33 32.9 2.8
## 34 58.3 23.5
## 35 52.2 13.1
## 36 43.3 17.3
## 37 54.1 24.5
## 38 82.3 50.7
## 39 89.7 29.4
## 40 42.4 12.8
## 41 35.0 19.4
## 42 59.3 24.6
## 43 26.6 27.2
## 44 49.0 25.4
## 45 39.1 24.1
## 46 29.0 13.0
## 47 26.2 23.1
## 48 42.6 19.1
## 49 24.5 8.1
## 50 33.8 13.4
## 51 36.7 24.9
## 52 28.4 12.6
## 53 46.9 24.5
## 54 41.8 53.4
## 55 24.1 11.8
## 56 23.9 75.6
## 57 40.6 10.0
## 58 29.6 5.5
## 59 50.9 12.7
## 60 30.3 16.3
## 61 39.2 35.0
## 62 31.1 3.5
## 63 28.6 49.3
## 64 24.8 42.6
## 65 39.4 41.0
## 66 26.0 40.0
## 67 35.6 12.4
## 68 60.3 11.9
## 69 52.7 27.6
## 70 54.2 51.4
## 71 57.9 61.9
## 72 19.8 7.4
## 73 30.1 26.1
## 74 82.0 57.4
## 75 50.7 50.3
## 76 31.2 32.1
select(snowdata, ends_with("C"))## NYC
## 1 39.0
## 2 11.3
## 3 29.5
## 4 23.8
## 5 27.1
## 6 31.4
## 7 30.6
## 8 63.2
## 9 46.6
## 10 13.8
## 11 11.6
## 12 19.7
## 13 15.1
## 14 15.8
## 15 11.5
## 16 33.5
## 17 21.9
## 18 44.7
## 19 13.0
## 20 39.2
## 21 54.7
## 22 18.1
## 23 16.3
## 24 44.7
## 25 24.4
## 26 21.4
## 27 51.5
## 28 19.5
## 29 30.2
## 30 25.6
## 31 15.5
## 32 22.9
## 33 2.8
## 34 23.5
## 35 13.1
## 36 17.3
## 37 24.5
## 38 50.7
## 39 29.4
## 40 12.8
## 41 19.4
## 42 24.6
## 43 27.2
## 44 25.4
## 45 24.1
## 46 13.0
## 47 23.1
## 48 19.1
## 49 8.1
## 50 13.4
## 51 24.9
## 52 12.6
## 53 24.5
## 54 53.4
## 55 11.8
## 56 75.6
## 57 10.0
## 58 5.5
## 59 12.7
## 60 16.3
## 61 35.0
## 62 3.5
## 63 49.3
## 64 42.6
## 65 41.0
## 66 40.0
## 67 12.4
## 68 11.9
## 69 27.6
## 70 51.4
## 71 61.9
## 72 7.4
## 73 26.1
## 74 57.4
## 75 50.3
## 76 32.1
select(snowdata, -ends_with("C"))## Winter Boston Chicago
## 1 1940-1941 47.8 52.5
## 2 1941-1942 23.9 29.8
## 3 1942-1943 45.7 45.2
## 4 1943-1944 27.7 24.0
## 5 1944-1945 59.2 34.9
## 6 1945-1946 50.8 23.9
## 7 1946-1947 19.4 34.1
## 8 1947-1948 89.2 38.1
## 9 1948-1949 37.1 14.3
## 10 1949-1950 32.0 33.8
## 11 1950-1951 29.7 54.4
## 12 1951-1952 31.9 66.4
## 13 1952-1953 29.8 23.4
## 14 1953-1954 23.6 43.2
## 15 1954-1955 25.1 32.2
## 16 1955-1956 60.9 26.3
## 17 1956-1957 52.0 31.3
## 18 1957-1958 44.7 20.0
## 19 1958-1959 34.1 41.0
## 20 1959-1960 40.9 50.9
## 21 1960-1961 61.5 40.7
## 22 1961-1962 44.7 58.9
## 23 1962-1963 30.9 42.7
## 24 1963-1964 63.0 35.2
## 25 1964-1965 50.4 59.5
## 26 1965-1966 44.1 24.9
## 27 1966-1967 60.1 68.4
## 28 1967-1968 44.8 28.4
## 29 1968-1969 53.8 29.4
## 30 1969-1970 48.8 77.0
## 31 1970-1971 57.3 37.9
## 32 1971-1972 47.5 46.8
## 33 1972-1973 10.3 32.9
## 34 1973-1974 36.9 58.3
## 35 1974-1975 27.6 52.2
## 36 1975-1976 46.6 43.3
## 37 1976-1977 58.5 54.1
## 38 1977-1978 85.1 82.3
## 39 1978-1979 27.5 89.7
## 40 1979-1980 12.5 42.4
## 41 1980-1981 22.3 35.0
## 42 1981-1982 61.8 59.3
## 43 1982-1983 32.7 26.6
## 44 1983-1984 43.0 49.0
## 45 1984-1985 26.6 39.1
## 46 1985-1986 18.1 29.0
## 47 1986-1987 42.5 26.2
## 48 1987-1988 52.6 42.6
## 49 1988-1989 15.5 24.5
## 50 1989-1990 39.2 33.8
## 51 1990-1991 19.1 36.7
## 52 1991-1992 22.0 28.4
## 53 1992-1993 83.9 46.9
## 54 1993-1994 96.3 41.8
## 55 1994-1995 14.9 24.1
## 56 1995-1996 107.6 23.9
## 57 1996-1997 51.9 40.6
## 58 1997-1998 25.6 29.6
## 59 1998-1999 36.4 50.9
## 60 1999-2000 24.4 30.3
## 61 2000-2001 45.9 39.2
## 62 2001-2002 15.1 31.1
## 63 2002-2003 70.9 28.6
## 64 2003-2004 39.4 24.8
## 65 2004-2005 86.6 39.4
## 66 2005-2006 39.9 26.0
## 67 2006-2007 17.1 35.6
## 68 2007-2008 51.2 60.3
## 69 2008-2009 65.9 52.7
## 70 2009-2010 35.7 54.2
## 71 2010-2011 81.0 57.9
## 72 2011-2012 9.3 19.8
## 73 2012-2013 63.4 30.1
## 74 2013-2014 58.9 82.0
## 75 2014-2015 110.6 50.7
## 76 2015-2016 36.2 31.2
# Keep only the numeric columns of snowdata (drops the character Winter column).
# select(where(...)) is the current idiom; select_if() is superseded in dplyr >= 1.0.
snowdata_numeric <- select(snowdata, where(is.numeric))
psych::describe(snowdata_numeric)## vars n mean sd median trimmed mad min max range skew kurtosis
## Boston 1 76 44.49 22.51 42.75 42.37 22.54 9.3 110.6 101.3 0.84 0.46
## Chicago 2 76 40.88 15.71 38.00 39.21 14.23 14.3 89.7 75.4 0.97 0.66
## NYC 3 76 27.05 15.89 24.25 25.64 16.01 2.8 75.6 72.8 0.84 0.03
## se
## Boston 2.58
## Chicago 1.80
## NYC 1.82
# subsetting by row and column numbers
lastrow <- nrow(snowdata)
snowdata[lastrow, ]## Winter Boston Chicago NYC
## 76 2015-2016 36.2 31.2 32.1
# one line of code gives you the same exact result as another
identical(snowdata[76,], snowdata[lastrow,])## [1] TRUE
# getting the lowest and the highest value
range(snowdata$Boston)## [1] 9.3 110.6
# pulling the row from snowdata that has the lowest Boston winter snow total
slice(snowdata, which.min(Boston))## Winter Boston Chicago NYC
## 1 2011-2012 9.3 19.8 7.4
# tidyselect
# relocating columns with everything()
mpg %>%
select(manufacturer, cyl, everything()) %>%
glimpse()## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "c…
# selecting the last column
mpg %>%
select(last_col()) %>%
glimpse()## Rows: 234
## Columns: 1
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "compact"…
# selecting all columns except the last one
mpg %>%
select(!last_col()) %>%
glimpse()## Rows: 234
## Columns: 10
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
# you can use last_col to select the n-to-last column
mpg %>%
select(last_col(1)) %>%
glimpse()## Rows: 234
## Columns: 1
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
# select all columns that start with the letter “m”
mpg %>%
select(starts_with("m")) %>%
glimpse()## Rows: 234
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
mpg %>%
select(ends_with(c("l", "r"))) %>%
glimpse()## Rows: 234
## Columns: 6
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
# selecting columns that contain certain strings
mpg %>%
select(contains("m")) %>%
glimpse()## Rows: 234
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
mpg %>%
rename(Manufacturer = manufacturer) %>%
select(contains("m", ignore.case = FALSE)) %>%
glimpse()## Rows: 234
## Columns: 1
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "a4 quat…
# selecting all columns that contain a number
billboard %>%
select(matches("\\d")) %>%
colnames()## [1] "wk1" "wk2" "wk3" "wk4" "wk5" "wk6" "wk7" "wk8" "wk9" "wk10"
## [11] "wk11" "wk12" "wk13" "wk14" "wk15" "wk16" "wk17" "wk18" "wk19" "wk20"
## [21] "wk21" "wk22" "wk23" "wk24" "wk25" "wk26" "wk27" "wk28" "wk29" "wk30"
## [31] "wk31" "wk32" "wk33" "wk34" "wk35" "wk36" "wk37" "wk38" "wk39" "wk40"
## [41] "wk41" "wk42" "wk43" "wk44" "wk45" "wk46" "wk47" "wk48" "wk49" "wk50"
## [51] "wk51" "wk52" "wk53" "wk54" "wk55" "wk56" "wk57" "wk58" "wk59" "wk60"
## [61] "wk61" "wk62" "wk63" "wk64" "wk65" "wk66" "wk67" "wk68" "wk69" "wk70"
## [71] "wk71" "wk72" "wk73" "wk74" "wk75" "wk76"
billboard %>%
select(matches("wk\\d{1}$")) %>%
colnames()## [1] "wk1" "wk2" "wk3" "wk4" "wk5" "wk6" "wk7" "wk8" "wk9"
anscombe %>%
select(matches("[xy][1-2]")) %>%
glimpse()## Rows: 11
## Columns: 4
## $ x1 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x2 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ y1 <dbl> 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68
## $ y2 <dbl> 9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74
# selecting columns with number ranges
anscombe %>%
select(num_range("x", 1:2)) %>%
glimpse()## Rows: 11
## Columns: 2
## $ x1 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x2 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
billboard %>%
select(num_range("wk", 1:15)) %>%
glimpse()## Rows: 317
## Columns: 15
## $ wk1 <dbl> 87, 91, 81, 76, 57, 51, 97, 84, 59, 76, 84, 57, 50, 71, 79, 80, 9…
## $ wk2 <dbl> 82, 87, 70, 76, 34, 39, 97, 62, 53, 76, 84, 47, 39, 51, 65, 78, 9…
## $ wk3 <dbl> 72, 92, 68, 72, 25, 34, 96, 51, 38, 74, 75, 45, 30, 28, 53, 76, 9…
## $ wk4 <dbl> 77, NA, 67, 69, 17, 26, 95, 41, 28, 69, 73, 29, 28, 18, 48, 77, 9…
## $ wk5 <dbl> 87, NA, 66, 67, 17, 26, 100, 38, 21, 68, 73, 23, 21, 13, 45, 92, …
## $ wk6 <dbl> 94, NA, 57, 65, 31, 19, NA, 35, 18, 67, 69, 18, 19, 13, 36, NA, 9…
## $ wk7 <dbl> 99, NA, 54, 55, 36, 2, NA, 35, 16, 61, 68, 11, 20, 11, 34, NA, 93…
## $ wk8 <dbl> NA, NA, 53, 59, 49, 2, NA, 38, 14, 58, 65, 9, 17, 1, 29, NA, 96, …
## $ wk9 <dbl> NA, NA, 51, 62, 53, 3, NA, 38, 12, 57, 73, 9, 17, 1, 27, NA, NA, …
## $ wk10 <dbl> NA, NA, 51, 61, 57, 6, NA, 36, 10, 59, 83, 11, 17, 2, 30, NA, NA,…
## $ wk11 <dbl> NA, NA, 51, 61, 64, 7, NA, 37, 9, 66, 92, 1, 17, 2, 36, NA, 99, N…
## $ wk12 <dbl> NA, NA, 51, 59, 70, 22, NA, 37, 8, 68, NA, 1, 3, 3, 37, NA, NA, 9…
## $ wk13 <dbl> NA, NA, 47, 61, 75, 29, NA, 38, 6, 61, NA, 1, 3, 3, 39, NA, 96, N…
## $ wk14 <dbl> NA, NA, 44, 66, 76, 36, NA, 49, 1, 67, NA, 1, 7, 4, 49, NA, 96, N…
## $ wk15 <dbl> NA, NA, 38, 72, 78, 47, NA, 61, 2, 59, NA, 4, 10, 12, 57, NA, 99,…
# selecting columns of a specific type
billboard %>%
select(where(is.character)) %>%
glimpse()## Rows: 317
## Columns: 2
## $ artist <chr> "2 Pac", "2Ge+her", "3 Doors Down", "3 Doors Down", "504 Boyz",…
## $ track <chr> "Baby Don't Cry (Keep...", "The Hardest Part Of ...", "Kryptoni…
# combining selections
mpg %>%
select(where(is.character) & contains("l")) %>%
glimpse()## Rows: 234
## Columns: 3
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "a4 quat…
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",…
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "compact"…
mpg %>%
select(where(is.character) | contains("l")) %>%
glimpse()## Rows: 234
## Columns: 8
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "c…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
data.table and the tidyverse
packages offer simpler solutions and speed up the workflow for the types
of problems we are dealing with. Both can be used for the same tasks.
You can learn one of them or both. The syntax used for data.table is
often more concise and arguably more consistent than that in dplyr (it
is in essence an extension of the [i, j] notation that we
have already used for data frames). Second, it is fast and
memory-efficient, which makes a huge difference if you are working with
big data. On the other hand, many people prefer the syntax in dplyr and
tidyr, which lends itself exceptionally well for usage
with pipes. If you work with small or medium-sized datasets, the
difference in performance between the two packages is negligible.
dplyr is also much better suited for working directly
with databases, which is a huge selling point if your data already is in
a database.
data.table relies heavily on the [i, j]
notation that is used for data frames in R. It also adds a third
element: [i, j, by]. Using this, R selects the rows
indicated by i, the columns indicated by j and groups them by by. This
makes it easy e.g. to compute grouped summaries. With the
tidyverse packages you will instead use new functions
with names like filter and summarise to
perform operations on your data. These are typically combined using the
pipe operator, %>%, which makes the code flow nicely
from left to right.
data.table syntax, as it only works on data.table objects. Luckily, dplyr works perfectly when used on data.table objects. Note that when using data.table, there is not an explicit assignment. We don't use <- to assign the new data frame to aq - instead the assignment happens automatically, in place. This means that you have to be a little bit careful, so that you don't inadvertently change your data when playing around with it.
When working with tidyverse packages, commands are
usually chained together using %>% pipes. When using
data.table, commands are chained by repeated use of
[] brackets on the same line.
To change the name of a variable, we can use setnames
from data.table or rename from
dplyr.
You'll frequently want to filter away some rows from your data. Perhaps you only want to select rows where a variable exceeds some value, or want to exclude rows with NA values. This can be done in several different ways: using row numbers, using conditions, at random, or using regular expressions.
In some situations you may wish to draw a random sample from your
data. This is done using the sample (data.table) and
sample_n (dplyr) functions.
In some cases, particularly when working with text data, you'll want
to filter using regular expressions. data.table has a
convenience function called %like% that can be used to call
grepl in an alternative (less opaque?) way. With
dplyr we use grepl in the usual
fashion.
Another common situation is that you want to remove some variables from your data. Perhaps the variables aren't of interest in a particular analysis that you're going to perform, or perhaps you've simply imported more variables than you need. As with rows, this can be done using numbers, names or regular expressions. When selecting a single column from a data frame, you sometimes want to extract the column as a vector and sometimes as a single-column data frame (for instance if you are going to pass it to a function that takes a data frame as input). You should be a little bit careful when doing this, to make sure that you get the column in the correct format.
In data.table, using regular expressions to select
columns is done using grep. dplyr differs
in that it has several convenience functions for selecting columns, like
starts_with, ends_with,
contains.
Sometimes you don't want to filter rows, but rearrange their order according to their values for some variable. Similarly, you may want to change the order of the columns in your data. This is often useful for presentation purposes, but can at times also aid in analyses.
In some cases, you may want to fill missing values of a variable with
the previous non-missing entry. To fill the missing values with the last
non-missing entry, we can now use nafill or
fill.
dplyr’s distinct() function will remove data frame rows
based on duplication in certain columns. If you want to keep all the
other variables in a data frame and not just the non-repetitive ones,
distinct() needs the additional
.keep_all = TRUE argument. Otherwise, it will return a data
frame with only the columns you want to ensure haven’t been
duplicated.
# Demo: the same basic column operations written in both data.table and dplyr.
aq <- as.data.table(airquality)
# modifying a variable (convert Wind from mph to m/s)
# data.table: := modifies aq in place, no assignment needed
aq[, Wind := Wind * 0.44704]
# dplyr: returns a new object, so assign the result back with <-
# (note: this applies the conversion a second time, mirroring the original demo)
aq <- aq %>% mutate(Wind = Wind * 0.44704)
# computing a new variable based on existing variables
# data.table
aq[, Hot := Temp > 90]
# dplyr (recomputes the same column; result unchanged)
aq <- aq %>% mutate(Hot = Temp > 90)
# renaming a variable
# data.table: setnames() also modifies in place
setnames(aq, "Hot", "HotDay")
# dplyr
# aq <- aq %>% rename(HotDay = Hot)
# removing a variable
# data.table: assigning NULL drops the column(s) in place
aq[, HotDay := NULL]
aq[, c("Month", "Day") := NULL] # multiple cols
# dplyr
# aq <- aq %>% select(-HotDay)
# aq <- aq %>% select(-Month, -Day) # multiple cols
# chaining commands
# data.table (reload first, since Month/Day were just deleted)
aq <- as.data.table(airquality)
aq[, Month := nafill(Month, "locf")][, .N, Month]## Month N
## 1: 5 31
## 2: 6 30
## 3: 7 31
## 4: 8 31
## 5: 9 30
# dplyr
aq %>%
fill(Month) %>%
group_by(Month, across(days = n()))## # A tibble: 153 × 7
## # Groups: Month, Ozone, Solar.R, Wind, Temp, Day, Hot [153]
## Ozone Solar.R Wind Temp Month Day Hot
## <int> <int> <dbl> <int> <int> <int> <lgl>
## 1 41 190 7.4 67 5 1 FALSE
## 2 36 118 8 72 5 2 FALSE
## 3 12 149 12.6 74 5 3 FALSE
## 4 18 313 11.5 62 5 4 FALSE
## 5 NA NA 14.3 56 5 5 FALSE
## 6 28 NA 14.9 66 5 6 FALSE
## 7 23 299 8.6 65 5 7 FALSE
## 8 19 99 13.8 59 5 8 FALSE
## 9 8 19 20.1 61 5 9 FALSE
## 10 NA 194 8.6 69 5 10 FALSE
## # … with 143 more rows
# reloading the data (restores the columns deleted by the previous chunk)
aq <- as.data.table(airquality)
# filtering using row numbers
# data.table
aq[3, ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: 12 149 12.6 74 5 3 FALSE
aq[3:5, ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: 12 149 12.6 74 5 3 FALSE
## 2: 18 313 11.5 62 5 4 FALSE
## 3: NA NA 14.3 56 5 5 FALSE
aq[c(3, 7, 15), ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: 12 149 12.6 74 5 3 FALSE
## 2: 23 299 8.6 65 5 7 FALSE
## 3: 18 65 13.2 58 5 15 FALSE
aq[-c(3, 7, 15), ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 18 313 11.5 62 5 4 FALSE
## 4: NA NA 14.3 56 5 5 FALSE
## 5: 28 NA 14.9 66 5 6 FALSE
## ---
## 146: 30 193 6.9 70 9 26 FALSE
## 147: NA 145 13.2 77 9 27 FALSE
## 148: 14 191 14.3 75 9 28 FALSE
## 149: 18 131 8.0 76 9 29 FALSE
## 150: 20 223 11.5 68 9 30 FALSE
# dplyr
aq %>% slice(3)## Ozone Solar.R Wind Temp Month Day Hot
## 1: 12 149 12.6 74 5 3 FALSE
aq %>% slice(3:5)## Ozone Solar.R Wind Temp Month Day Hot
## 1: 12 149 12.6 74 5 3 FALSE
## 2: 18 313 11.5 62 5 4 FALSE
## 3: NA NA 14.3 56 5 5 FALSE
aq %>% slice(c(3, 7, 15))## Ozone Solar.R Wind Temp Month Day Hot
## 1: 12 149 12.6 74 5 3 FALSE
## 2: 23 299 8.6 65 5 7 FALSE
## 3: 18 65 13.2 58 5 15 FALSE
aq %>% slice(-c(3, 7, 15))## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 18 313 11.5 62 5 4 FALSE
## 4: NA NA 14.3 56 5 5 FALSE
## 5: 28 NA 14.9 66 5 6 FALSE
## ---
## 146: 30 193 6.9 70 9 26 FALSE
## 147: NA 145 13.2 77 9 27 FALSE
## 148: 14 191 14.3 75 9 28 FALSE
## 149: 18 131 8.0 76 9 29 FALSE
## 150: 20 223 11.5 68 9 30 FALSE
# filtering using conditions
# data.table
aq[Temp > 90, ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 259 10.9 93 6 11 TRUE
## 2: NA 250 9.2 92 6 12 TRUE
## 3: 97 267 6.3 92 7 8 TRUE
## 4: 97 272 5.7 92 7 9 TRUE
## 5: NA 291 14.9 91 7 14 TRUE
## 6: NA 222 8.6 92 8 10 TRUE
## 7: 76 203 9.7 97 8 28 TRUE
## 8: 118 225 2.3 94 8 29 TRUE
## 9: 84 237 6.3 96 8 30 TRUE
## 10: 85 188 6.3 94 8 31 TRUE
## 11: 96 167 6.9 91 9 1 TRUE
## 12: 78 197 5.1 92 9 2 TRUE
## 13: 73 183 2.8 93 9 3 TRUE
## 14: 91 189 4.6 93 9 4 TRUE
aq[Month == 6, ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 286 8.6 78 6 1 FALSE
## 2: NA 287 9.7 74 6 2 FALSE
## 3: NA 242 16.1 67 6 3 FALSE
## 4: NA 186 9.2 84 6 4 FALSE
## 5: NA 220 8.6 85 6 5 FALSE
## 6: NA 264 14.3 79 6 6 FALSE
## 7: 29 127 9.7 82 6 7 FALSE
## 8: NA 273 6.9 87 6 8 FALSE
## 9: 71 291 13.8 90 6 9 FALSE
## 10: 39 323 11.5 87 6 10 FALSE
## 11: NA 259 10.9 93 6 11 TRUE
## 12: NA 250 9.2 92 6 12 TRUE
## 13: 23 148 8.0 82 6 13 FALSE
## 14: NA 332 13.8 80 6 14 FALSE
## 15: NA 322 11.5 79 6 15 FALSE
## 16: 21 191 14.9 77 6 16 FALSE
## 17: 37 284 20.7 72 6 17 FALSE
## 18: 20 37 9.2 65 6 18 FALSE
## 19: 12 120 11.5 73 6 19 FALSE
## 20: 13 137 10.3 76 6 20 FALSE
## 21: NA 150 6.3 77 6 21 FALSE
## 22: NA 59 1.7 76 6 22 FALSE
## 23: NA 91 4.6 76 6 23 FALSE
## 24: NA 250 6.3 76 6 24 FALSE
## 25: NA 135 8.0 75 6 25 FALSE
## 26: NA 127 8.0 78 6 26 FALSE
## 27: NA 47 10.3 73 6 27 FALSE
## 28: NA 98 11.5 80 6 28 FALSE
## 29: NA 31 14.9 77 6 29 FALSE
## 30: NA 138 8.0 83 6 30 FALSE
## Ozone Solar.R Wind Temp Month Day Hot
aq[Temp > 90 & Month == 6, ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 259 10.9 93 6 11 TRUE
## 2: NA 250 9.2 92 6 12 TRUE
aq[Temp %between% c(80, 90), ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: 45 252 14.9 81 5 29 FALSE
## 2: NA 186 9.2 84 6 4 FALSE
## 3: NA 220 8.6 85 6 5 FALSE
## 4: 29 127 9.7 82 6 7 FALSE
## 5: NA 273 6.9 87 6 8 FALSE
## 6: 71 291 13.8 90 6 9 FALSE
## 7: 39 323 11.5 87 6 10 FALSE
## 8: 23 148 8.0 82 6 13 FALSE
## 9: NA 332 13.8 80 6 14 FALSE
## 10: NA 98 11.5 80 6 28 FALSE
## 11: NA 138 8.0 83 6 30 FALSE
## 12: 135 269 4.1 84 7 1 FALSE
## 13: 49 248 9.2 85 7 2 FALSE
## 14: 32 236 9.2 81 7 3 FALSE
## 15: NA 101 10.9 84 7 4 FALSE
## 16: 64 175 4.6 83 7 5 FALSE
## 17: 40 314 10.9 83 7 6 FALSE
## 18: 77 276 5.1 88 7 7 FALSE
## 19: 85 175 7.4 89 7 10 FALSE
## 20: NA 139 8.6 82 7 11 FALSE
## 21: 27 175 14.9 81 7 13 FALSE
## 22: 7 48 14.3 80 7 15 FALSE
## 23: 48 260 6.9 81 7 16 FALSE
## 24: 35 274 10.3 82 7 17 FALSE
## 25: 61 285 6.3 84 7 18 FALSE
## 26: 79 187 5.1 87 7 19 FALSE
## 27: 63 220 11.5 85 7 20 FALSE
## 28: NA 258 9.7 81 7 22 FALSE
## 29: NA 295 11.5 82 7 23 FALSE
## 30: 80 294 8.6 86 7 24 FALSE
## 31: 108 223 8.0 85 7 25 FALSE
## 32: 20 81 8.6 82 7 26 FALSE
## 33: 52 82 12.0 86 7 27 FALSE
## 34: 82 213 7.4 88 7 28 FALSE
## 35: 50 275 7.4 86 7 29 FALSE
## 36: 64 253 7.4 83 7 30 FALSE
## 37: 59 254 9.2 81 7 31 FALSE
## 38: 39 83 6.9 81 8 1 FALSE
## 39: 9 24 13.8 81 8 2 FALSE
## 40: 16 77 7.4 82 8 3 FALSE
## 41: 78 NA 6.9 86 8 4 FALSE
## 42: 35 NA 7.4 85 8 5 FALSE
## 43: 66 NA 4.6 87 8 6 FALSE
## 44: 122 255 4.0 89 8 7 FALSE
## 45: 89 229 10.3 90 8 8 FALSE
## 46: 110 207 8.0 90 8 9 FALSE
## 47: NA 137 11.5 86 8 11 FALSE
## 48: 44 192 11.5 86 8 12 FALSE
## 49: 28 273 11.5 82 8 13 FALSE
## 50: 65 157 9.7 80 8 14 FALSE
## 51: 168 238 3.4 81 8 25 FALSE
## 52: 73 215 8.0 86 8 26 FALSE
## 53: NA 153 5.7 88 8 27 FALSE
## 54: 47 95 7.4 87 9 5 FALSE
## 55: 32 92 15.5 84 9 6 FALSE
## 56: 20 252 10.9 80 9 7 FALSE
## 57: 44 236 14.9 81 9 11 FALSE
## 58: 16 201 8.0 82 9 20 FALSE
## 59: 36 139 10.3 81 9 23 FALSE
## Ozone Solar.R Wind Temp Month Day Hot
aq[frankv(-Temp,
ties.method = "min"
) <= 5, ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 259 10.9 93 6 11 TRUE
## 2: 76 203 9.7 97 8 28 TRUE
## 3: 118 225 2.3 94 8 29 TRUE
## 4: 84 237 6.3 96 8 30 TRUE
## 5: 85 188 6.3 94 8 31 TRUE
## 6: 73 183 2.8 93 9 3 TRUE
## 7: 91 189 4.6 93 9 4 TRUE
unique(aq) # removing duplicate rows## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: NA NA 14.3 56 5 5 FALSE
## ---
## 149: 30 193 6.9 70 9 26 FALSE
## 150: NA 145 13.2 77 9 27 FALSE
## 151: 14 191 14.3 75 9 28 FALSE
## 152: 18 131 8.0 76 9 29 FALSE
## 153: 20 223 11.5 68 9 30 FALSE
na.omit(aq) # removing rows with missing data## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: 23 299 8.6 65 5 7 FALSE
## ---
## 107: 14 20 16.6 63 9 25 FALSE
## 108: 30 193 6.9 70 9 26 FALSE
## 109: 14 191 14.3 75 9 28 FALSE
## 110: 18 131 8.0 76 9 29 FALSE
## 111: 20 223 11.5 68 9 30 FALSE
na.omit(aq, "Ozone") # removing rows with missing Ozone values## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: 28 NA 14.9 66 5 6 FALSE
## ---
## 112: 14 20 16.6 63 9 25 FALSE
## 113: 30 193 6.9 70 9 26 FALSE
## 114: 14 191 14.3 75 9 28 FALSE
## 115: 18 131 8.0 76 9 29 FALSE
## 116: 20 223 11.5 68 9 30 FALSE
# dplyr
aq %>% filter(Temp > 90)## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 259 10.9 93 6 11 TRUE
## 2: NA 250 9.2 92 6 12 TRUE
## 3: 97 267 6.3 92 7 8 TRUE
## 4: 97 272 5.7 92 7 9 TRUE
## 5: NA 291 14.9 91 7 14 TRUE
## 6: NA 222 8.6 92 8 10 TRUE
## 7: 76 203 9.7 97 8 28 TRUE
## 8: 118 225 2.3 94 8 29 TRUE
## 9: 84 237 6.3 96 8 30 TRUE
## 10: 85 188 6.3 94 8 31 TRUE
## 11: 96 167 6.9 91 9 1 TRUE
## 12: 78 197 5.1 92 9 2 TRUE
## 13: 73 183 2.8 93 9 3 TRUE
## 14: 91 189 4.6 93 9 4 TRUE
aq %>% filter(Month == 6)## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 286 8.6 78 6 1 FALSE
## 2: NA 287 9.7 74 6 2 FALSE
## 3: NA 242 16.1 67 6 3 FALSE
## 4: NA 186 9.2 84 6 4 FALSE
## 5: NA 220 8.6 85 6 5 FALSE
## 6: NA 264 14.3 79 6 6 FALSE
## 7: 29 127 9.7 82 6 7 FALSE
## 8: NA 273 6.9 87 6 8 FALSE
## 9: 71 291 13.8 90 6 9 FALSE
## 10: 39 323 11.5 87 6 10 FALSE
## 11: NA 259 10.9 93 6 11 TRUE
## 12: NA 250 9.2 92 6 12 TRUE
## 13: 23 148 8.0 82 6 13 FALSE
## 14: NA 332 13.8 80 6 14 FALSE
## 15: NA 322 11.5 79 6 15 FALSE
## 16: 21 191 14.9 77 6 16 FALSE
## 17: 37 284 20.7 72 6 17 FALSE
## 18: 20 37 9.2 65 6 18 FALSE
## 19: 12 120 11.5 73 6 19 FALSE
## 20: 13 137 10.3 76 6 20 FALSE
## 21: NA 150 6.3 77 6 21 FALSE
## 22: NA 59 1.7 76 6 22 FALSE
## 23: NA 91 4.6 76 6 23 FALSE
## 24: NA 250 6.3 76 6 24 FALSE
## 25: NA 135 8.0 75 6 25 FALSE
## 26: NA 127 8.0 78 6 26 FALSE
## 27: NA 47 10.3 73 6 27 FALSE
## 28: NA 98 11.5 80 6 28 FALSE
## 29: NA 31 14.9 77 6 29 FALSE
## 30: NA 138 8.0 83 6 30 FALSE
## Ozone Solar.R Wind Temp Month Day Hot
aq %>% filter(
Temp > 90,
Month == 6
)## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 259 10.9 93 6 11 TRUE
## 2: NA 250 9.2 92 6 12 TRUE
aq %>% filter(dplyr::between(Temp, 70, 90))## Ozone Solar.R Wind Temp Month Day Hot
## 1: 36 118 8.0 72 5 2 FALSE
## 2: 12 149 12.6 74 5 3 FALSE
## 3: 7 NA 6.9 74 5 11 FALSE
## 4: 11 320 16.6 73 5 22 FALSE
## 5: 45 252 14.9 81 5 29 FALSE
## ---
## 103: 36 139 10.3 81 9 23 FALSE
## 104: 30 193 6.9 70 9 26 FALSE
## 105: NA 145 13.2 77 9 27 FALSE
## 106: 14 191 14.3 75 9 28 FALSE
## 107: 18 131 8.0 76 9 29 FALSE
aq %>% top_n(5, Temp)## Ozone Solar.R Wind Temp Month Day Hot
## 1: NA 259 10.9 93 6 11 TRUE
## 2: 76 203 9.7 97 8 28 TRUE
## 3: 118 225 2.3 94 8 29 TRUE
## 4: 84 237 6.3 96 8 30 TRUE
## 5: 85 188 6.3 94 8 31 TRUE
## 6: 73 183 2.8 93 9 3 TRUE
## 7: 91 189 4.6 93 9 4 TRUE
aq %>% distinct # removing duplicate rows based on duplication on certain columns## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: NA NA 14.3 56 5 5 FALSE
## ---
## 149: 30 193 6.9 70 9 26 FALSE
## 150: NA 145 13.2 77 9 27 FALSE
## 151: 14 191 14.3 75 9 28 FALSE
## 152: 18 131 8.0 76 9 29 FALSE
## 153: 20 223 11.5 68 9 30 FALSE
# another example of distinct()
# contributions <- map_df(list.files("mayor_finance_reports",
# full.names = TRUE
# ), rio::import) %>%
# filter(City == "Framingham", !str_detect(tolower(Address), "box")) %>%
# distinct(Contributor, Address, .keep_all = TRUE)
aq %>% drop_na # removing rows with missing data## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: 23 299 8.6 65 5 7 FALSE
## ---
## 107: 14 20 16.6 63 9 25 FALSE
## 108: 30 193 6.9 70 9 26 FALSE
## 109: 14 191 14.3 75 9 28 FALSE
## 110: 18 131 8.0 76 9 29 FALSE
## 111: 20 223 11.5 68 9 30 FALSE
aq %>% drop_na("Ozone") # removing rows with missing Ozone values## Ozone Solar.R Wind Temp Month Day Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: 28 NA 14.9 66 5 6 FALSE
## ---
## 112: 14 20 16.6 63 9 25 FALSE
## 113: 30 193 6.9 70 9 26 FALSE
## 114: 14 191 14.3 75 9 28 FALSE
## 115: 18 131 8.0 76 9 29 FALSE
## 116: 20 223 11.5 68 9 30 FALSE
# selecting rows at random
# data.table
aq[sample(.N, 5), ]## Ozone Solar.R Wind Temp Month Day Hot
## 1: 7 49 10.3 69 9 24 FALSE
## 2: 35 274 10.3 82 7 17 FALSE
## 3: 19 99 13.8 59 5 8 FALSE
## 4: 79 187 5.1 87 7 19 FALSE
## 5: 9 36 14.3 72 8 22 FALSE
# dplyr
aq %>% sample_n(5)## Ozone Solar.R Wind Temp Month Day Hot
## 1: 61 285 6.3 84 7 18 FALSE
## 2: 28 NA 14.9 66 5 6 FALSE
## 3: 22 71 10.3 77 8 16 FALSE
## 4: 44 236 14.9 81 9 11 FALSE
## 5: 44 192 11.5 86 8 12 FALSE
# using regular expressions to select rows
# Small example table used below to demonstrate regex-based row
# filtering (%like% in data.table, grepl with dplyr::filter).
dogs <- data.table(
  Name  = c("Bianca", "Bella", "Mimmi", "Daisy", "Ernst", "Smulan"),
  Breed = c("Greyhound", "Greyhound", "Pug", "Poodle",
            "Bedlington Terrier", "Boxer"),
  Desc  = c("Fast, playful",
            "Fast, easily worried",
            "Intense, small, loud",
            "Majestic, protective, playful",
            "Playful, relaxed",
            "Loving, cuddly, playful")
)
# data.table
dogs[Name %like% "^B", ]## Name Breed Desc
## 1: Bianca Greyhound Fast, playful
## 2: Bella Greyhound Fast, easily worried
# or:
dogs[grepl("^B", Name), ]## Name Breed Desc
## 1: Bianca Greyhound Fast, playful
## 2: Bella Greyhound Fast, easily worried
dogs[Desc %like% "[pP]layful", ]## Name Breed Desc
## 1: Bianca Greyhound Fast, playful
## 2: Daisy Poodle Majestic, protective, playful
## 3: Ernst Bedlington Terrier Playful, relaxed
## 4: Smulan Boxer Loving, cuddly, playful
# dplyr
dogs %>% filter(grepl("B[a-z]", Name))## Name Breed Desc
## 1: Bianca Greyhound Fast, playful
## 2: Bella Greyhound Fast, easily worried
dogs %>% filter(grepl("[pP]layful", Desc))## Name Breed Desc
## 1: Bianca Greyhound Fast, playful
## 2: Daisy Poodle Majestic, protective, playful
## 3: Ernst Bedlington Terrier Playful, relaxed
## 4: Smulan Boxer Loving, cuddly, playful
# selecting a single column
# data.table
# Return a vector:
aq$Temp## [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
## [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
## [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# or
aq[, Temp]## [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
## [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
## [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# Return a data.table:
aq[, "Temp"]## Temp
## 1: 67
## 2: 72
## 3: 74
## 4: 62
## 5: 56
## ---
## 149: 70
## 150: 77
## 151: 75
## 152: 76
## 153: 68
aq[, .(Temp, Month, Day)]## Temp Month Day
## 1: 67 5 1
## 2: 72 5 2
## 3: 74 5 3
## 4: 62 5 4
## 5: 56 5 5
## ---
## 149: 70 9 26
## 150: 77 9 27
## 151: 75 9 28
## 152: 76 9 29
## 153: 68 9 30
aq[, Wind:Month]## Wind Temp Month
## 1: 7.4 67 5
## 2: 8.0 72 5
## 3: 12.6 74 5
## 4: 11.5 62 5
## 5: 14.3 56 5
## ---
## 149: 6.9 70 9
## 150: 13.2 77 9
## 151: 14.3 75 9
## 152: 8.0 76 9
## 153: 11.5 68 9
aq[, -c("Month", "Day")]## Ozone Solar.R Wind Temp Hot
## 1: 41 190 7.4 67 FALSE
## 2: 36 118 8.0 72 FALSE
## 3: 12 149 12.6 74 FALSE
## 4: 18 313 11.5 62 FALSE
## 5: NA NA 14.3 56 FALSE
## ---
## 149: 30 193 6.9 70 FALSE
## 150: NA 145 13.2 77 FALSE
## 151: 14 191 14.3 75 FALSE
## 152: 18 131 8.0 76 FALSE
## 153: 20 223 11.5 68 FALSE
aq[, sapply(msleep, class) == "numeric"] # selecting all numeric variables## name genus vore order conservation sleep_total
## FALSE FALSE FALSE FALSE FALSE TRUE
## sleep_rem sleep_cycle awake brainwt bodywt
## TRUE TRUE TRUE TRUE TRUE
aq[, .SD, .SDcols = colSums(is.na(aq)) == 0] # removing cols with missing values## Wind Temp Month Day Hot
## 1: 7.4 67 5 1 FALSE
## 2: 8.0 72 5 2 FALSE
## 3: 12.6 74 5 3 FALSE
## 4: 11.5 62 5 4 FALSE
## 5: 14.3 56 5 5 FALSE
## ---
## 149: 6.9 70 9 26 FALSE
## 150: 13.2 77 9 27 FALSE
## 151: 14.3 75 9 28 FALSE
## 152: 8.0 76 9 29 FALSE
## 153: 11.5 68 9 30 FALSE
# dplyr
# Return a vector:
aq$Temp## [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
## [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
## [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# or
aq %>% pull(Temp)## [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
## [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
## [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# Return a tibble:
aq %>% select(Temp)## Temp
## 1: 67
## 2: 72
## 3: 74
## 4: 62
## 5: 56
## ---
## 149: 70
## 150: 77
## 151: 75
## 152: 76
## 153: 68
aq %>% select(Temp, Month, Day)## Temp Month Day
## 1: 67 5 1
## 2: 72 5 2
## 3: 74 5 3
## 4: 62 5 4
## 5: 56 5 5
## ---
## 149: 70 9 26
## 150: 77 9 27
## 151: 75 9 28
## 152: 76 9 29
## 153: 68 9 30
aq %>% select(Wind:Month)## Wind Temp Month
## 1: 7.4 67 5
## 2: 8.0 72 5
## 3: 12.6 74 5
## 4: 11.5 62 5
## 5: 14.3 56 5
## ---
## 149: 6.9 70 9
## 150: 13.2 77 9
## 151: 14.3 75 9
## 152: 8.0 76 9
## 153: 11.5 68 9
aq %>% select(-Month, -Day)## Ozone Solar.R Wind Temp Hot
## 1: 41 190 7.4 67 FALSE
## 2: 36 118 8.0 72 FALSE
## 3: 12 149 12.6 74 FALSE
## 4: 18 313 11.5 62 FALSE
## 5: NA NA 14.3 56 FALSE
## ---
## 149: 30 193 6.9 70 FALSE
## 150: NA 145 13.2 77 FALSE
## 151: 14 191 14.3 75 FALSE
## 152: 18 131 8.0 76 FALSE
## 153: 20 223 11.5 68 FALSE
aq %>% select_if(is.numeric)## Ozone Solar.R Wind Temp Month Day
## 1: 41 190 7.4 67 5 1
## 2: 36 118 8.0 72 5 2
## 3: 12 149 12.6 74 5 3
## 4: 18 313 11.5 62 5 4
## 5: NA NA 14.3 56 5 5
## ---
## 149: 30 193 6.9 70 9 26
## 150: NA 145 13.2 77 9 27
## 151: 14 191 14.3 75 9 28
## 152: 18 131 8.0 76 9 29
## 153: 20 223 11.5 68 9 30
aq %>% select_if(~all(!is.na(.)))## Wind Temp Month Day Hot
## 1: 7.4 67 5 1 FALSE
## 2: 8.0 72 5 2 FALSE
## 3: 12.6 74 5 3 FALSE
## 4: 11.5 62 5 4 FALSE
## 5: 14.3 56 5 5 FALSE
## ---
## 149: 6.9 70 9 26 FALSE
## 150: 13.2 77 9 27 FALSE
## 151: 14.3 75 9 28 FALSE
## 152: 8.0 76 9 29 FALSE
## 153: 11.5 68 9 30 FALSE
# using regular expressions to select columns
# data.table
vars <- grepl("n", names(aq))
aq[, ..vars]## Ozone Wind Month
## 1: 41 7.4 5
## 2: 36 8.0 5
## 3: 12 12.6 5
## 4: 18 11.5 5
## 5: NA 14.3 5
## ---
## 149: 30 6.9 9
## 150: NA 13.2 9
## 151: 14 14.3 9
## 152: 18 8.0 9
## 153: 20 11.5 9
# dplyr
# contains is a convenience
# function for checking if a name
# contains a string:
aq %>% select(contains("n"))## Ozone Wind Month
## 1: 41 7.4 5
## 2: 36 8.0 5
## 3: 12 12.6 5
## 4: 18 11.5 5
## 5: NA 14.3 5
## ---
## 149: 30 6.9 9
## 150: NA 13.2 9
## 151: 14 14.3 9
## 152: 18 8.0 9
## 153: 20 11.5 9
# matches can be used with any
# regular expression:
aq %>% select(matches("n"))## Ozone Wind Month
## 1: 41 7.4 5
## 2: 36 8.0 5
## 3: 12 12.6 5
## 4: 18 11.5 5
## 5: NA 14.3 5
## ---
## 149: 30 6.9 9
## 150: NA 13.2 9
## 151: 14 14.3 9
## 152: 18 8.0 9
## 153: 20 11.5 9
# subsetting using columns numbers
# using column numbers can yield different results depending on what type of data table you're using
aq <- as.data.frame(airquality)
str(aq[,2])## int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# data.table:
aq <- as.data.table(airquality)
str(aq[,2])## Classes 'data.table' and 'data.frame': 153 obs. of 1 variable:
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## - attr(*, ".internal.selfref")=<externalptr>
# tibble:
aq <- as_tibble(airquality)
str(aq[,2])## tibble [153 × 1] (S3: tbl_df/tbl/data.frame)
## $ Solar.R: int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# aq[[2]] works the same for data frames, data tables and tibbles, returning a vector
aq <- as.data.frame(airquality)
str(aq[[2]])## int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# data.table:
aq <- as.data.table(airquality)
str(aq[[2]])## int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# tibble:
aq <- as_tibble(airquality)
str(aq[[2]])## int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# changing the column order
# data.table
setcolorder(aq, c("Month", "Day"))
# dplyr
aq %>% relocate("Month", "Day")## # A tibble: 153 × 7
## Month Day Ozone Solar.R Wind Temp Hot
## <int> <int> <int> <int> <dbl> <int> <lgl>
## 1 5 1 41 190 7.4 67 FALSE
## 2 5 2 36 118 8 72 FALSE
## 3 5 3 12 149 12.6 74 FALSE
## 4 5 4 18 313 11.5 62 FALSE
## 5 5 5 NA NA 14.3 56 FALSE
## 6 5 6 28 NA 14.9 66 FALSE
## 7 5 7 23 299 8.6 65 FALSE
## 8 5 8 19 99 13.8 59 FALSE
## 9 5 9 8 19 20.1 61 FALSE
## 10 5 10 NA 194 8.6 69 FALSE
## # … with 143 more rows
# changing row order
# sorting a single vector
aq <- data.table(airquality)
sort(aq$Wind)## [1] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6
## [38] 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7
## [75] 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [112] 8 8 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
## [149] 9 9 9 9 9
sort(aq$Wind, decreasing = TRUE)## [1] 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 8 8 8 8 8 8 8
## [38] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 7 7 7 7 7 7 7 7 7 7 7 7 7
## [75] 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [112] 6 6 6 6 6 6 6 6 6 6 6 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [149] 5 5 5 5 5
sort(c("C", "B", "A", "D"))## [1] "A" "B" "C" "D"
# data.table
aq[order(Wind), ] # ascending order## Month Day Ozone Solar.R Wind Temp Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: NA NA 14.3 56 5 5 FALSE
## ---
## 149: 30 193 6.9 70 9 26 FALSE
## 150: NA 145 13.2 77 9 27 FALSE
## 151: 14 191 14.3 75 9 28 FALSE
## 152: 18 131 8.0 76 9 29 FALSE
## 153: 20 223 11.5 68 9 30 FALSE
aq[order(-Wind), ] # descending order## Month Day Ozone Solar.R Wind Temp Hot
## 1: 96 167 6.9 91 9 1 TRUE
## 2: 78 197 5.1 92 9 2 TRUE
## 3: 73 183 2.8 93 9 3 TRUE
## 4: 91 189 4.6 93 9 4 TRUE
## 5: 47 95 7.4 87 9 5 FALSE
## ---
## 149: NA NA 8.0 57 5 27 FALSE
## 150: 23 13 12.0 67 5 28 FALSE
## 151: 45 252 14.9 81 5 29 FALSE
## 152: 115 223 5.7 79 5 30 FALSE
## 153: 37 279 7.4 76 5 31 FALSE
aq[order(Temp, -Wind), ]## Month Day Ozone Solar.R Wind Temp Hot
## 1: 96 167 6.9 91 9 1 TRUE
## 2: 39 83 6.9 81 8 1 FALSE
## 3: 135 269 4.1 84 7 1 FALSE
## 4: NA 286 8.6 78 6 1 FALSE
## 5: 41 190 7.4 67 5 1 FALSE
## ---
## 149: NA 138 8.0 83 6 30 FALSE
## 150: 115 223 5.7 79 5 30 FALSE
## 151: 85 188 6.3 94 8 31 TRUE
## 152: 59 254 9.2 81 7 31 FALSE
## 153: 37 279 7.4 76 5 31 FALSE
# dplyr
aq %>% arrange(Wind) # ascending order## Month Day Ozone Solar.R Wind Temp Hot
## 1: 41 190 7.4 67 5 1 FALSE
## 2: 36 118 8.0 72 5 2 FALSE
## 3: 12 149 12.6 74 5 3 FALSE
## 4: 18 313 11.5 62 5 4 FALSE
## 5: NA NA 14.3 56 5 5 FALSE
## ---
## 149: 30 193 6.9 70 9 26 FALSE
## 150: NA 145 13.2 77 9 27 FALSE
## 151: 14 191 14.3 75 9 28 FALSE
## 152: 18 131 8.0 76 9 29 FALSE
## 153: 20 223 11.5 68 9 30 FALSE
aq %>% arrange(-Wind) # descending order## Month Day Ozone Solar.R Wind Temp Hot
## 1: 96 167 6.9 91 9 1 TRUE
## 2: 78 197 5.1 92 9 2 TRUE
## 3: 73 183 2.8 93 9 3 TRUE
## 4: 91 189 4.6 93 9 4 TRUE
## 5: 47 95 7.4 87 9 5 FALSE
## ---
## 149: NA NA 8.0 57 5 27 FALSE
## 150: 23 13 12.0 67 5 28 FALSE
## 151: 45 252 14.9 81 5 29 FALSE
## 152: 115 223 5.7 79 5 30 FALSE
## 153: 37 279 7.4 76 5 31 FALSE
# or
aq %>% arrange(desc(Wind))## Month Day Ozone Solar.R Wind Temp Hot
## 1: 96 167 6.9 91 9 1 TRUE
## 2: 78 197 5.1 92 9 2 TRUE
## 3: 73 183 2.8 93 9 3 TRUE
## 4: 91 189 4.6 93 9 4 TRUE
## 5: 47 95 7.4 87 9 5 FALSE
## ---
## 149: NA NA 8.0 57 5 27 FALSE
## 150: 23 13 12.0 67 5 28 FALSE
## 151: 45 252 14.9 81 5 29 FALSE
## 152: 115 223 5.7 79 5 30 FALSE
## 153: 37 279 7.4 76 5 31 FALSE
aq %>% arrange(Temp, desc(Wind))## Month Day Ozone Solar.R Wind Temp Hot
## 1: 96 167 6.9 91 9 1 TRUE
## 2: 39 83 6.9 81 8 1 FALSE
## 3: 135 269 4.1 84 7 1 FALSE
## 4: NA 286 8.6 78 6 1 FALSE
## 5: 41 190 7.4 67 5 1 FALSE
## ---
## 149: NA 138 8.0 83 6 30 FALSE
## 150: 115 223 5.7 79 5 30 FALSE
## 151: 85 188 6.3 94 8 31 TRUE
## 152: 59 254 9.2 81 7 31 FALSE
## 153: 37 279 7.4 76 5 31 FALSE
# filling in missing values
aq$Month[c(2:3, 36:39, 70)] <- NA
# data.table
aq[, Month := data.table::nafill(Month, "locf")] # fill the missing values with the last non-missing entry
aq[, Month := nafill(Month, "nocb")] # fill the missing values with the next non-missing entry
# dplyr
aq %>% fill(Month) -> aq
aq %>% fill(Month, .direction = "up") -> aq
Character values in R can be stored as scalars, vectors, or matrices,
or they can be columns of a data frame or elements of a list. When
applied to objects like this, the length function will
report the number of character values in the object, not the number of
characters in each string. To find the number of characters in a
character value, the nchar function can be used. Like most
functions in R, nchar is vectorized.
Like other objects in R, character values will be displayed when
their name is typed at the console or when they are passed to the print
function. However, it is often more convenient to print or display these
objects directly without the subscripts that the print function
provides. The cat function will combine character values
and print them to the screen or a file directly. The cat
function coerces its arguments to character values, then concatenates
and displays them. This makes the function ideal for printing messages
and warnings from inside of functions.
cat will always print a newline when it encounters a
newline character. When there are multiple strings passed to
cat, or when the argument to cat is a vector
of character strings, the fill= argument can be used to
automatically insert newlines into the output string. If
fill= is set to TRUE, the value of the system
width option will be used to determine the linesize; if a numeric value
is used, the output will be displayed using that width, although cat
will not insert newlines into individual elements of its input.
The cat function also accepts a file= argument to
specify that its output should be directed to a file. When the
file= argument is used, the append=TRUE
argument can also be provided to have cat append its output to an
already existing file. For more control over the way that character
values are concatenated, the paste function can be used. In
its simplest usage, this function will accept an unlimited number of
scalars, and join them together, separating each scalar with a space by
default. To use a character string other than a space as a separator,
the sep= argument can be used. If any object passed to
paste is not of mode character, it is converted to character. If a
character vector is passed to paste, the collapse= argument
can be used to specify a character string to place between each element
of the vector.
When multiple arguments are passed to paste, it will
vectorize the operation, recycling shorter elements when necessary. This
makes it easy to generate variable names with a common prefix.
Individual characters of character values are not accessible through
ordinary subscripting. Instead, the substring function can
be used either to extract parts of character strings, or to change the
values of parts of character strings. In addition to the string being
operated on, substring accepts a first=
argument giving the first character of the desired
substring, and a last= argument giving the
last character. If not specified, last= defaults to a large
number, so that specifying just a first= value will operate
from that character to the end of the string. Like most functions in R,
substring is vectorized, operating on multiple strings at
once. In the case of strings that have fewer characters than specified
in the last= argument, substring returns as
many characters as it finds with no padding provided.
For finding locations of particular characters within a character
string, the string first needs to be converted to a character vector
containing individual characters. This can be done by passing a vector
consisting of all the characters to be processed as both the
first= and last= arguments, and then applying
which to the result.
Regular expressions are supported in the R functions
strsplit, grep, sub, and
gsub, as well as in the regexpr and
gregexpr functions which are the main tools for working
with regular expressions in R. Regular expression syntax varies
depending on the particular implementation a program uses. R tries to
provide a great deal of flexibility regarding the regular expressions it
understands. By default, R uses extended regular expressions as
specified by the POSIX 1003.2 standard; the historical basic set, and
the extended= argument that once switched between the two, are defunct
in current versions of R. To use regular expressions like
those supported by scripting languages such as perl and python, the
perl=TRUE argument can be used.
The backslash character (\) is used in regular
expressions to signal that certain characters with special meaning in
regular expressions should be treated as normal characters. In R, this
means that two backslash characters need to be entered into an input
string anywhere that special characters need to be escaped. Although the
double backslash will display when the string is printed,
nchar or cat can verify that only a single
backslash is actually included in the string.
Single backslashes, like those which are part of a newline character
(\n), will be interpreted correctly inside of regular
expressions.
The
strsplit
function can use a character string or regular expression to divide up a
character string into smaller pieces. The first argument to
strsplit is the character string to break up, and the
second argument is the character value or regular expression which
should be used to break up the string into parts. Like other functions
that can return different numbers of elements from their inputs,
strsplit returns its results as a list, even when its input
is a single character string. Because strsplit can accept
regular expressions to decide where to split a character string, a wide
variety of situations can be easily handled.
The grep function accepts a regular expression and a
character string or vector of character strings, and returns the indices
of those elements of the strings which are matched by the regular
expression. If the value=TRUE argument is passed to grep,
it will return the actual strings which matched the expression instead
of the indices. If the string to be matched should be interpreted
literally (i.e., not as a regular expression), the
fixed=TRUE argument should be used. One important use of
grep is to extract a set of variables from a data frame
based on their names. To create a data frame with just these variables,
we can use the output of grep as a subscript. To find
regular expressions without regard to the case (upper or lower) of the
input, the ignore.case=TRUE argument can be used.
Surrounding a string with escaped angle brackets
(\\< and \\>) restricts matches to the case where the
string is surrounded by either white space, punctuation, or a line
ending or beginning. If the regular expression passed to grep is not
matched in any of its inputs, grep returns an empty numeric
vector. Thus, the any function can be used to test if a regular
expression occurs anywhere in a vector of strings.
While the grep function can be used to test for the
presence of a regular expression, sometimes more details regarding the
matches that are found are needed. In R, the regexpr and
gregexpr functions can be used to pinpoint and possibly
extract those parts of a string that were matched by a regular
expression. The output from these functions is a vector of starting
positions of the regular expressions which were found; if no match
occurred, a value of -1 is returned. In addition, an attribute called
match.length is associated with the vector of starting positions to
provide information about exactly which characters were involved in the
match. The regexpr function will only provide information
about the first match in its input string(s), while the
gregexpr function returns information about all matches
found. The input arguments to regexpr and
gregexpr are similar to those of grep; however, the
ignore.case=TRUE argument is not available in versions of R
earlier than version 2.6.
Since regexpr only reports the first match it finds, it
will always return a vector, with -1 in those positions where no match
was found. To extract the strings that actually matched,
substr can be used, after calculating the ending position
from the regexpr output and the match.length
attribute.
For substituting text based on regular expressions, R provides two
functions: sub and gsub. Each of these
functions accepts a regular expression, a string containing what will be
substituted for the regular expression, and the string or strings to
operate on. The sub function changes only the first
occurrence of the regular expression, while the gsub
function performs the substitution on all occurrences within the string.
One important use of these functions concerns numeric data which is read
from text sources like web pages or financial reports, and which may
contain commas or dollar signs.
When using the substitution functions, a powerful feature known as
tagging of regular expressions is available. When part of a regular
expression is surrounded by (unescaped) parentheses, that part can be
used in a substitution pattern by representing it as a backslash
followed by a number. The first tagged pattern is represented by
\\1, the second by \\2, and so on. To extract
just the tagged pattern from a regular expression, one possibility is to
use the regular expression beginning and end anchor characters
(^ and $, respectively) to account for all the
nontagged characters in the string, and specify just the tagged
expression for the substitution string.
If you wish to create a number of similar strings based on
information from other variables, you can use sprintf,
which allows you to write a string using %s as a
placeholder for the values that should be pulled from other
variables.
# finding the lengths of the names of the states of USA
nchar(state.name)## [1] 7 6 7 8 10 8 11 8 7 7 6 5 8 7 4 6 8 9 5 8 13 8 9 11 8
## [26] 7 8 6 13 10 10 8 14 12 4 8 6 12 12 14 12 9 5 4 7 8 10 13 9 7
# using cat function
x <- 7
y <- 10
cat("x should be greater than y, but x =", x, "and y =", y)## x should be greater than y, but x = 7 and y = 10
# with fill argument
cat("Long strings can", "be displayed over", "several lines using", "the fill= argument", fill = 40)## Long strings can be displayed over
## several lines using the fill= argument
# using paste
paste("one", 2, "three", 4, "five")## [1] "one 2 three 4 five"
paste(c("one", "two", "three", "four"), collapse = " ")## [1] "one two three four"
paste(c("X", "Y"), 1:5, sep = "")## [1] "X1" "Y2" "X3" "Y4" "X5"
paste(c("X", "Y"), 1:5, sep = "_", collapse = "|")## [1] "X_1|Y_2|X_3|Y_4|X_5"
paste(c("X", "Y"), 1:5, sep = "_") # with space -> no collapse argument## [1] "X_1" "Y_2" "X_3" "Y_4" "X_5"
# working with parts of character values
substring(state.name, 2, 6)## [1] "labam" "laska" "rizon" "rkans" "alifo" "olora" "onnec" "elawa" "lorid"
## [10] "eorgi" "awaii" "daho" "llino" "ndian" "owa" "ansas" "entuc" "ouisi"
## [19] "aine" "aryla" "assac" "ichig" "innes" "issis" "issou" "ontan" "ebras"
## [28] "evada" "ew Ha" "ew Je" "ew Me" "ew Yo" "orth " "orth " "hio" "klaho"
## [37] "regon" "ennsy" "hode " "outh " "outh " "ennes" "exas" "tah" "ermon"
## [46] "irgin" "ashin" "est V" "iscon" "yomin"
# finding location of particular characters
state <- "Mississippi"
ll <- nchar(state)
ltrs <- substring(state, 1:ll, 1:ll)
ltrs## [1] "M" "i" "s" "s" "i" "s" "s" "i" "p" "p" "i"
which(ltrs == "s")## [1] 3 4 6 7
# regular expressions
expr <- ".*\\.txt"
nchar(expr)## [1] 7
cat(expr, "\n")## .*\.txt
# breaking apart character values
sentence <- "R is a free software environment for statistical computing"
parts <- strsplit(sentence, " ")
parts## [[1]]
## [1] "R" "is" "a" "free" "software"
## [6] "environment" "for" "statistical" "computing"
length(parts)## [1] 1
length(parts[[1]])## [1] 9
sapply(parts, length)## [1] 9
allparts <- unlist(parts)
allparts## [1] "R" "is" "a" "free" "software"
## [6] "environment" "for" "statistical" "computing"
str <- "one two three four"
strsplit(str, " +")## [[1]]
## [1] "one" "two" "three" "four"
# using regex
grep("^pop", names(LifeCycleSavings)) # indices## [1] 2 3
grep("^pop", names(LifeCycleSavings), value = TRUE) # values## [1] "pop15" "pop75"
# creating a data frame using grep as a subscript
head(LifeCycleSavings[, grep("^pop", names(LifeCycleSavings))])## pop15 pop75
## Australia 29.35 2.87
## Austria 23.32 4.41
## Belgium 23.80 4.43
## Bolivia 41.89 1.67
## Brazil 42.19 0.83
## Canada 31.72 2.85
# ignoring upper or lower case
inp <- c("run dog run", "work doggedly", "CAT AND DOG")
grep("\\<dog\\>", inp, ignore.case = TRUE)## [1] 1 3
# checking if a regex occurs anywhere in text
str1 <- c("The R Foundation", "is a not for profit organization", "working in the public interest")
str2 <- c(" It was founded by the members", "of the R Core Team in order", "to provide support for the R project")
any(grep("profit", str1))## [1] TRUE
any(grep("profit", str2))## [1] FALSE
# using regexpr
tst <- c("one x7 two b1", "three c5 four b9", "five six seven", "a8 eight nine")
wh <- regexpr("[a-z][0-9]", tst)
wh## [1] 5 7 -1 1
## attr(,"match.length")
## [1] 2 2 -1 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
res <- substring(tst, wh, wh + attr(wh, "match.length") - 1)
res## [1] "x7" "c5" "" "a8"
# using gregexpr
wh1 <- gregexpr("[a-z][0-9]", tst)
wh1## [[1]]
## [1] 5 12
## attr(,"match.length")
## [1] 2 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] 7 15
## attr(,"match.length")
## [1] 2 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[4]]
## [1] 1
## attr(,"match.length")
## [1] 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
res1 <- list()
for (i in seq_along(wh1)) {
res1[[i]] <- substring(
tst[i], wh1[[i]],
wh1[[i]] +
attr(wh1[[i]], "match.length") - 1
)
}
res1## [[1]]
## [1] "x7" "b1"
##
## [[2]]
## [1] "c5" "b9"
##
## [[3]]
## [1] ""
##
## [[4]]
## [1] "a8"
# substitutions
values <- c("$11,317.35", "$11,234.51", "$11,275.89", "$11,278.93", "$11,294.94")
as.numeric(gsub("[$,]", "", values))## [1] 11317.35 11234.51 11275.89 11278.93 11294.94
# tagging
values <- c("75.99", "(20.30)", "55.20")
as.numeric(gsub("\\(([0-9.]+)\\)", "-\\1", values))## [1] 75.99 -20.30 55.20
# extracting the tagged pattern
str <- "report: 17 value=12 time=2:00"
sub("value=([^ ]+)", "\\1", str)## [1] "report: 17 12 time=2:00"
sub("^.*value=([^ ]+).*$", "\\1", str)## [1] "12"
# using variables into strings
names <- c("Irma", "Bea", "Lisa")
ages <- c(5, 59, 36)
sprintf("%s is %s years old.", names, ages)## [1] "Irma is 5 years old." "Bea is 59 years old." "Lisa is 36 years old."
To work with text-string manipulation we use the
stringr package, whose functions always begin with
str_*, followed by a verb; the first argument is
always a character vector. Most of the
str_* functions use regular expressions.
# data
geo <- pull(emisiones, 1)
# how many characters has every element
head( str_length(geo), 10)## [1] 36 73 7 8 14 7 48 7 7 6
# to lower
head (str_to_lower(geo), 10)## [1] "european union (current composition)"
## [2] "european union (current composition) and iceland under the kyoto protocol"
## [3] "belgium"
## [4] "bulgaria"
## [5] "czech republic"
## [6] "denmark"
## [7] "germany (until 1990 former territory of the frg)"
## [8] "estonia"
## [9] "ireland"
## [10] "greece"
# joining characters
str_c (geo[26 : 35], collapse = ", ")## [1] "Slovenia, Slovakia, Finland, Sweden, United Kingdom, Iceland, Liechtenstein, Norway, Switzerland, Turkey"
str_c (geo[ 26 : 35], 1:10, sep = "_" )## [1] "Slovenia_1" "Slovakia_2" "Finland_3" "Sweden_4"
## [5] "United Kingdom_5" "Iceland_6" "Liechtenstein_7" "Norway_8"
## [9] "Switzerland_9" "Turkey_10"
# extracting characters between two indices
str_sub(geo[ 26 : 35], 1, 3)## [1] "Slo" "Slo" "Fin" "Swe" "Uni" "Ice" "Lie" "Nor" "Swi" "Tur"
# str_ and regex
str_subset(geo, "[p]")## [1] "European Union (current composition)"
## [2] "European Union (current composition) and Iceland under the Kyoto Protocol"
## [3] "Czech Republic"
## [4] "Spain"
## [5] "Cyprus"
str_subset(geo, "[pl]")## [1] "European Union (current composition)"
## [2] "European Union (current composition) and Iceland under the Kyoto Protocol"
## [3] "Belgium"
## [4] "Bulgaria"
## [5] "Czech Republic"
## [6] "Germany (until 1990 former territory of the FRG)"
## [7] "Ireland"
## [8] "Spain"
## [9] "Italy"
## [10] "Cyprus"
## [11] "Malta"
## [12] "Netherlands"
## [13] "Poland"
## [14] "Portugal"
## [15] "Slovenia"
## [16] "Slovakia"
## [17] "Finland"
## [18] "Iceland"
## [19] "Switzerland"
str_subset(geo, "^E")## [1] "European Union (current composition)"
## [2] "European Union (current composition) and Iceland under the Kyoto Protocol"
## [3] "Estonia"
str_subset(geo, "E$")## character(0)
# counting
str_count(geo, "[ou]")## [1] 7 13 1 1 1 0 4 1 0 0 0 0 1 0 1 0 1 3 1 0 0 1 1 2 1
## [26] 1 1 0 0 1 0 0 1 0 1
# detecting a pattern
str_detect(geo, "^L")## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
# extracting a pattern
str_extract(geo, "Euro")## [1] "Euro" "Euro" NA NA NA NA NA NA NA NA
## [11] NA NA NA NA NA NA NA NA NA NA
## [21] NA NA NA NA NA NA NA NA NA NA
## [31] NA NA NA NA NA
str_extract(geo, "[euro]")## [1] "u" "u" "e" "u" "e" "e" "e" "o" "r" "r" NA "r" "r" NA "r" NA "u" "u" "u"
## [20] NA "e" "u" "o" "o" "o" "o" "o" NA "e" "e" "e" "e" "o" "e" "u"
# replacing characters
str_replace(geo, "\\(current composition\\)" , "")## [1] "European Union "
## [2] "European Union and Iceland under the Kyoto Protocol"
## [3] "Belgium"
## [4] "Bulgaria"
## [5] "Czech Republic"
## [6] "Denmark"
## [7] "Germany (until 1990 former territory of the FRG)"
## [8] "Estonia"
## [9] "Ireland"
## [10] "Greece"
## [11] "Spain"
## [12] "France"
## [13] "Croatia"
## [14] "Italy"
## [15] "Cyprus"
## [16] "Latvia"
## [17] "Lithuania"
## [18] "Luxembourg"
## [19] "Hungary"
## [20] "Malta"
## [21] "Netherlands"
## [22] "Austria"
## [23] "Poland"
## [24] "Portugal"
## [25] "Romania"
## [26] "Slovenia"
## [27] "Slovakia"
## [28] "Finland"
## [29] "Sweden"
## [30] "United Kingdom"
## [31] "Iceland"
## [32] "Liechtenstein"
## [33] "Norway"
## [34] "Switzerland"
## [35] "Turkey"
geo2 <- str_c(geo[26 : 35], 1 :10, sep="_")
geo2## [1] "Slovenia_1" "Slovakia_2" "Finland_3" "Sweden_4"
## [5] "United Kingdom_5" "Iceland_6" "Liechtenstein_7" "Norway_8"
## [9] "Switzerland_9" "Turkey_10"
str_replace(geo2, "_[0-9]{1,2}", "")## [1] "Slovenia" "Slovakia" "Finland" "Sweden"
## [5] "United Kingdom" "Iceland" "Liechtenstein" "Norway"
## [9] "Switzerland" "Turkey"
# creating the format "leading zero"
str_pad(1:12, 2, "left", "0")## [1] "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12"
Conceptually, factors are variables in R which take on a limited number of different values; such variables are often referred to as categorical variables. One of the most important uses of factors is in statistical modeling; since categorical variables enter into statistical models differently than continuous variables, storing data as factors insures that the modeling functions will treat such data correctly.
Factors in R are stored as a vector of integer values with a
corresponding set of character values to use when the factor is
displayed. The factor function is used to create a factor. The only
required argument to factor is a vector of values which will be returned
as a vector of factor values. Both numeric and character variables can
be made into factors, but a factor’s levels will always be character
values. You can see the possible levels for a factor by calling the
levels function; the nlevels function will
return the number of levels of a factor.
To change the order in which the levels will be displayed from their
default sorted order, the levels= argument can be given a
vector of all the possible values of the variable in the order you
desire. If the ordering should also be used when performing comparisons,
use the optional ordered=TRUE argument. In this case, the
factor is known as an ordered factor.
The levels of a factor are used when displaying the factor’s values.
You can change these levels at the time you create a factor by passing a
vector with the new values through the labels= argument.
Note that this actually changes the internal levels of the factor, and
to change the labels of a factor after it has been created, the
assignment form of the levels function is used.
Factors represent a very efficient way to store character values,
because each unique character value is stored only once, and the data
itself is stored as a vector of integers. Because of this,
read.table will automatically convert character variables
to factors unless the as.is=TRUE or
stringsAsFactors=FALSE arguments are specified, or the
stringsAsFactors system option is set to
FALSE. Comparison operators are not supported for unordered
factors. The order in which the levels are displayed is determined by
the order in which they appear in the levels= argument to
factor. Sometimes, a factor needs to be reordered on the basis of some
property of that factor. The reorder function takes three
arguments: a factor, a vector of values on which the reordering is
based, and a function to operate on those values for each factor level.
When reorder is used, it assigns an attribute called scores
which contains the value used for the reordering.
For some statistical procedures, the interpretation of results can be
simplified by forcing a particular order to a factor; in particular, it
may be useful to choose a “reference” level, which should be the first
level of the factor. The relevel function allows you to
choose a reference level, which will then be treated as the first level
of the factor.
While it may be necessary to convert a numeric variable to a factor
for a particular application, it is often very useful to convert the
factor back to its original numeric values, since even simple arithmetic
operations will fail when using factors. Since the
as.numeric function will simply return the internal integer
values of the factor, the conversion must be done using the levels
attribute of the factor, or by first converting the factor to a
character value using as.character.
When a factor is first created, all of its levels are stored along
with the factor, and if subsets of the factor are extracted, they will
retain all of the original levels. This can create problems when
constructing model matrices and may or may not be useful when displaying
the data using, say, the table function. To change this, we
can use the drop=TRUE argument to the subscripting
operator. When used with factors, this argument will remove the unused
levels.
To exclude certain levels from appearing in a factor, the
exclude= argument can be passed to factor. By default, the
missing value (NA) is excluded from factor levels; to
create a factor that includes missing values from a numeric variable,
use exclude=NULL.
Care must be taken when combining variables which are factors,
because the c function will interpret the factors as
integers. To combine factors, they should first be converted back to
their original values (through the levels function), then
concatenated and converted to a new factor.
The cut function is used to convert a numeric variable
into a factor. The breaks= argument to cut is used to
describe how ranges of numbers will be converted to factor values. If a
number is provided through the breaks= argument, the
resulting factor will be created by dividing the range of the variable
into that number of equal-length intervals; if a vector of values is
provided, the values in the vector are used to determine the
breakpoints. Note that if a vector of values is provided, the number of
levels of the resultant factor will be one less than the number of
values in the vector. Notice that the default label for factors produced
by cut contains the actual range of values that were used to divide the
variable into factors. The pretty function can be used to
choose cut points that are round numbers, but it may not return the
number of levels that’s actually desired. The labels=
argument to cut allows you to specify the levels of the factors. To
produce factors based on percentiles of your data (for example,
quartiles or deciles), the quantile function can be used to
generate the breaks= argument, ensuring nearly equal
numbers of observations in each of the levels of the factor.
If you wish to create a factor based on one of the components of that
date, you can extract it with strftime and convert it to a
factor directly. Sometimes more flexibility can be achieved by using the
cut function, which understands time units of months, days, weeks, and
years through the breaks= argument. (For date/time values,
units of hours, minutes, and seconds can also be used.). By default, cut
starts weeks on Mondays; to use Sundays instead, pass the
start.on.monday=FALSE argument to cut. Multiples of units
can also be specified through the breaks= argument.
Sometimes it is useful to treat all combinations of several factors
as if they were a single factor. In situations like these, the
interaction function can be used. This function will take
two or more factors, and create a new, unordered factor whose levels
correspond to the combinations of the levels of the input factors.
interaction’s default behavior is to include all possible combinations
of its input factors. To retain only those combinations for which there
were observations, the drop=TRUE argument can be passed to
interaction. By default, interaction forms levels for the
new factor by joining the levels of its component factors with a period
(.). This can be overridden with the sep=
argument.
# factor(): stores values as integer codes plus a character levels attribute
data <- c(1, 2, 2, 3, 1, 2, 3, 3, 1, 2, 3, 3, 1)
fdata <- factor(data)
fdata## [1] 1 2 2 3 1 2 3 3 1 2 3 3 1
## Levels: 1 2 3
# modifying levels in factors
# labels= renames the levels at creation time
rdata <- factor(data, labels = c("I", "II", "III"))
rdata## [1] I II II III I II III III I II III III I
## Levels: I II III
# assignment form of levels() relabels an already-created factor
levels(fdata) <- c("I", "II", "III")
fdata## [1] I II II III I II III III I II III III I
## Levels: I II III
# unordered factor
mons <- c("March", "April", "January", "November", "January", "September", "October", "September", "November", "August", "January", "November", "November", "February", "May", "August", "July", "December", "August", "August", "September", "November", "February", "April")
mons <- factor(mons)
# levels default to alphabetical sort order, not calendar order
table(mons)## mons
## April August December February January July March May
## 2 4 1 2 3 1 1 1
## November October September
## 5 1 3
# ordered factor
# levels= fixes calendar order; ordered = TRUE also enables comparisons ("<")
mons <- factor(mons, levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"), ordered = TRUE)
mons[1] < mons[2]## [1] TRUE
table(mons)## mons
## January February March April May June July August
## 3 2 1 2 1 0 1 4
## September October November December
## 3 1 5 1
# reordering levels
levels(InsectSprays$spray)## [1] "A" "B" "C" "D" "E" "F"
# reorder(): sorts levels by mean(count) per level; stores the means in "scores"
InsectSprays$spray <- with(InsectSprays, reorder(spray, count, mean))
levels(InsectSprays$spray)## [1] "C" "E" "D" "A" "B" "F"
attr(InsectSprays$spray, "scores")## A B C D E F
## 14.500000 15.333333 2.083333 4.916667 3.500000 16.666667
# forcing a particular order to a factor with relevel
levels(InsectSprays$spray)## [1] "C" "E" "D" "A" "B" "F"
# relevel() moves the given level ("A") to first position (reference level)
InsectSprays$spray <- relevel(InsectSprays$spray, "A")
levels(InsectSprays$spray)## [1] "A" "C" "E" "D" "B" "F"
# converting factors to numeric
fert <- c(10, 20, 20, 50, 10, 20, 10, 50, 20)
fert <- factor(fert, levels = c(10, 20, 50), ordered = TRUE)
fert## [1] 10 20 20 50 10 20 10 50 20
## Levels: 10 < 20 < 50
# as.numeric() alone would return the internal codes 1/2/3;
# go through levels() or as.character() to recover the original values
mean(as.numeric(levels(fert)[fert]))## [1] 23.33333
mean(as.numeric(as.character(fert)))## [1] 23.33333
# dropping unused levels
lets <- sample(letters, size = 100, replace = TRUE)
lets <- factor(lets)
# a subset keeps all original levels, shown here as zero counts
table(lets[1:5])##
## a b c d e f g h i j k l m n o p q r s t u v w x y
## 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0
# drop = TRUE in the subscript removes the unused levels
table(lets[1:5, drop = TRUE])##
## f i q u v
## 1 1 1 1 1
# or
table(factor(lets[1:5]))##
## f i q u v
## 1 1 1 1 1
# combining factors
# c() on factors would combine the integer codes, so convert back to
# character values first, concatenate, then re-factor
fact1 <- factor(sample(letters, size = 10, replace = TRUE))
fact2 <- factor(sample(letters, size = 10, replace = TRUE))
fact12 <- factor(c(levels(fact1)[fact1], levels(fact2)[fact2]))
fact12## [1] v d j c k i r d b n n g t a m i n z e r
## Levels: a b c d e g i j k m n r t v z
# creating factors from continuous variables
# cut() with a single number splits the range into that many equal-width bins
wfact <- cut(women$weight, 3)
table(wfact)## wfact
## (115,131] (131,148] (148,164]
## 6 5 4
# pretty() picks round-number breakpoints (may not yield exactly 3 bins)
wfact <- cut(women$weight, pretty(women$weight, 3))
wfact## [1] (100,120] (100,120] (100,120] (120,140] (120,140] (120,140] (120,140]
## [8] (120,140] (120,140] (140,160] (140,160] (140,160] (140,160] (140,160]
## [15] (160,180]
## Levels: (100,120] (120,140] (140,160] (160,180]
# labels= replaces the default "(lo,hi]" interval labels
wfact <- cut(women$weight, 3, labels = c("Low", "Medium", "High"))
table(wfact)## wfact
## Low Medium High
## 6 5 4
# quantile-based breaks give roughly equal counts per bin (quartiles here)
wfact <- cut(women$weight, quantile(women$weight, (0:4) / 4))
table(wfact)## wfact
## (115,124] (124,135] (135,148] (148,164]
## 3 4 3 4
# creating factors based on dates and times
everyday <- seq(from = as.Date("2005-1-1"), to = as.Date("2005-12-31"), by = "day")
cmonth <- format(everyday, "%b")
# unique(cmonth) preserves calendar order since the dates are sequential
months <- factor(cmonth, levels = unique(cmonth), ordered = TRUE)
table(months)## months
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 31 28 31 30 31 30 31 31 30 31 30 31
# with cut
# cut() on dates understands time units ("week" starts on Mondays by default)
wks <- cut(everyday, breaks = "week")
head(wks)## [1] 2004-12-27 2004-12-27 2005-01-03 2005-01-03 2005-01-03 2005-01-03
## 53 Levels: 2004-12-27 2005-01-03 2005-01-10 2005-01-17 ... 2005-12-26
qtrs <- cut(everyday, "3 months", labels = paste("Q", 1:4, sep = ""))
head(qtrs)## [1] Q1 Q1 Q1 Q1 Q1 Q1
## Levels: Q1 Q2 Q3 Q4
# interactions
nlevels(CO2$Plant)## [1] 12
nlevels(CO2$Type)## [1] 2
# interaction() crosses factors: 12 x 2 = 24 combinations by default
newfact <- interaction(CO2$Plant, CO2$Type)
nlevels(newfact)## [1] 24
# drop = TRUE keeps only combinations that actually occur in the data
newfact1 <- interaction(CO2$Plant, CO2$Type, drop = TRUE, sep = "_")
nlevels(newfact1)## [1] 12
levels(newfact1)## [1] "Qn1_Quebec" "Qn2_Quebec" "Qn3_Quebec" "Qc1_Quebec"
## [5] "Qc3_Quebec" "Qc2_Quebec" "Mn3_Mississippi" "Mn2_Mississippi"
## [9] "Mn1_Mississippi" "Mc2_Mississippi" "Mc3_Mississippi" "Mc1_Mississippi"
# Recipe snippets (mostly commented out because the data are not available)
# converting multiple numeric variables to factor (no data)
# cols <- c(35:74)
# df.j[, cols] <- lapply(df.j[, cols], factor)
# assigning levels to factor variables (no data)
# btw2009 <- within (btw2009, levels(stateA) <- c("BW", "BY", "BE","BB", "HB", "HH", "HE", "MV", "NI", "NW","RP", "SL", "SN", "ST", "SH", "TH"))
# re-ordering levels and assigning to a new factor variable
# ls <- with(btw9s, Bundesland[order(EW, -Voters)]) # reorder levels and suppress one var
# btw9s <- within(btw9s, State1 <- factor(Bundesland, levels=ls)) # create a new factor var and assign levels
# re-assigning ordered levels to variables in a data frame
levels(Arthritis$Improved)## [1] "None" "Some" "Marked"
# ordered() is equivalent to factor(..., ordered = TRUE)
Arthritis$Improved <- ordered(Arthritis$Improved, levels = c("None", "Some", "Marked"))
# or
# exp1_long$condition <- factor(exp1_long$condition, levels = c("No_communication", "High_confidence", "Medium_confidence", "Low_confidence"))
# in table form there are occasions when you need numeric values for the levels of ordered factors in a table (no data)
# Simply re-assign the dimnames attribute of the table variables
# dimnames(JobSat)$income <- c(7.5, 20, 32.5, 60)
# dimnames(JobSat)$satisfaction <- 1:4
# You want to preserve the character labels of table dimensions, but also
# allow them to be sorted in some particular order
# dimnames(JobSat)$income <- paste(1:4, dimnames(JobSat)$income, sep = ":")
# dimnames(JobSat)$satisfaction <- paste(1:4, dimnames(JobSat)$satisfaction, sep = ":")
A factor is a data structure in R that allows you to create a set of
categories. We call these categories levels. It is well
known in the psychological literature that we can only store a certain
number of things in our working memory. Therefore, to help people make
sense of categories, we shouldn’t show too many of them. This is where
the strength of lumping factors shows. Lumping is nothing more than
combining factor levels into a new and larger category. Factor variables
are very useful but not very easy to manipulate.
forcats contains very useful functions that make
working on factor variables painless. The four following functions,
fct_recode(), fct_relevel(),
fct_reorder() and fct_relabel(), are the ones
you must know. fct_reorder() is especially useful for
plotting. fct_lump*() functions make it possible to lump
several levels of a factor into a new other level. The tidyverse team no
longer recommends the use of this function. Instead, we can use the new
functions created in 2020:
fct_lump_min: lumps levels that do not occur more than min times.
fct_lump_n: lumps all levels except the n most (or least) frequently occurring levels.
fct_lump_prop: lumps levels that occur at most prop * n times.
fct_lump_lowfreq: lumps the least frequent levels.
fct_lump_min summarizes all levels that do not appear
more than min times. Compared to fct_lump_min,
fct_lump_n is not about the number of levels. Instead, it
simply keeps the most frequent levels or the least frequent levels. We
have to decide what to do with the levels that occur the same number of
times. If you don’t give the function any additional information,
fct_lump_n will show you all the levels whose number falls
below the last level, which is clearly one of the most frequent levels.
You can change this behavior with the ties.method argument. The default
argument is min. The other options are “average”, “first”,
“last”, “random” and “max”.
fct_lump_prop represents the percentage at which a
particular level occurs within the total number of levels.
Some of these four functions cause the Other level not
to be the least common level. fct_lump_lowfreq simply
ensures that so many levels are grouped together that the “Other” is
still the least frequent level. It has no additional arguments except
other_level, which is used to specify the name of the
“other” level.
# gss_cat: General Social Survey sample data shipped with forcats
head(gss_cat)## # A tibble: 6 × 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near r… Prot… Sout… 12
## 2 2000 Divorced 48 White $8000 to 9999 Not str re… Prot… Bapt… NA
## 3 2000 Widowed 67 White Not applicable Independent Prot… No d… 2
## 4 2000 Never married 39 White Not applicable Ind,near r… Orth… Not … 4
## 5 2000 Divorced 25 White Not applicable Not str de… None Not … 1
## 6 2000 Married 25 White $20000 - 24999 Strong dem… Prot… Sout… NA
str(gss_cat$marital)## Factor w/ 6 levels "No answer","Never married",..: 2 4 5 2 4 6 2 4 6 6 ...
str(gss_cat$rincome)## Factor w/ 16 levels "No answer","Don't know",..: 8 8 16 16 16 5 4 9 4 4 ...
# recoding levels
# fct_recode(new_name = "old_name"); mapping "Separated" and "Divorced"
# to the same new name merges the two levels
gss_cat <- gss_cat %>%
mutate(marital = fct_recode(marital,
refuse = "No answer",
never_married = "Never married",
divorced = "Separated",
divorced = "Divorced",
widowed = "Widowed",
married = "Married"
))
gss_cat %>%
janitor::tabyl(marital)## marital n percent
## refuse 17 0.0007913234
## never_married 5416 0.2521063166
## divorced 4126 0.1920588372
## widowed 1807 0.0841130196
## married 10117 0.4709305032
# lumping categories with the old not-recommended fct_lump() function
# levels occurring in less than 10% of rows are collapsed into "other"
gss_cat <- gss_cat %>%
mutate(marital = fct_lump(marital, prop = 0.10, other_level = "other"))
# the four newer fct_lump_*() variants applied side by side to `denom`
gss_cat %>%
mutate(
# Description of the different functions taken from help(fct_lump)
denom_lowfreq = fct_lump_lowfreq(denom), # lumps together the least frequent levels, ensuring that "other" is still the smallest level.
denom_min = fct_lump_min(denom, min = 10), # lumps levels that appear fewer than min times.
denom_n = fct_lump_n(denom, n = 3), # lumps all levels except for the n most frequent (or least frequent if n < 0)
denom_prop = fct_lump_prop(denom, prop = 0.10) # lumps levels that appear fewer than prop * n times.
)## # A tibble: 21,483 × 13
## year marital age race rincome partyid relig denom tvhours denom…¹ denom…²
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <fct>
## 1 2000 never_… 26 White $8000 … Ind,ne… Prot… Sout… 12 Southe… Southe…
## 2 2000 divorc… 48 White $8000 … Not st… Prot… Bapt… NA Baptis… Baptis…
## 3 2000 other 67 White Not ap… Indepe… Prot… No d… 2 No den… No den…
## 4 2000 never_… 39 White Not ap… Ind,ne… Orth… Not … 4 Not ap… Not ap…
## 5 2000 divorc… 25 White Not ap… Not st… None Not … 1 Not ap… Not ap…
## 6 2000 married 25 White $20000… Strong… Prot… Sout… NA Southe… Southe…
## 7 2000 never_… 36 White $25000… Not st… Chri… Not … 3 Not ap… Not ap…
## 8 2000 divorc… 44 White $7000 … Ind,ne… Prot… Luth… NA Luther… Luther…
## 9 2000 married 44 White $25000… Not st… Prot… Other 0 Other Other
## 10 2000 married 47 White $25000… Strong… Prot… Sout… 3 Southe… Southe…
## # … with 21,473 more rows, 2 more variables: denom_n <fct>, denom_prop <fct>,
## # and abbreviated variable names ¹denom_lowfreq, ²denom_min
gss_cat %>%
tabyl(marital)## marital n percent
## never_married 5416 0.25210632
## divorced 4126 0.19205884
## married 10117 0.47093050
## other 1824 0.08490434
# another example
# without lumping, every artist gets its own bar
billboard %>%
ggplot(aes(y = artist)) +
geom_bar()billboard %>%
mutate(artist = fct_lump(as_factor(artist), 10)) %>%
filter(artist != "Other") %>%
ggplot(aes(y = artist)) +
geom_bar()# fct_lump() new functions
# using fct_lump_min()
# keep only artists appearing at least 3 times; drop the "Other" bucket
billboard %>%
mutate(artist = fct_lump_min(as_factor(artist), 3)) %>%
filter(artist != "Other") %>%
ggplot(aes(y = artist)) +
geom_bar()# another example
table(gss_cat$rincome)##
## No answer Don't know Refused $25000 or more $20000 - 24999
## 183 267 975 7363 1283
## $15000 - 19999 $10000 - 14999 $8000 to 9999 $7000 to 7999 $6000 to 6999
## 1048 1168 340 188 215
## $5000 to 5999 $4000 to 4999 $3000 to 3999 $1000 to 2999 Lt $1000
## 227 226 276 395 286
## Not applicable
## 7043
gss_cat %>%
mutate(rincome = fct_lump_min(rincome, 600)) %>%
ggplot(aes(y = fct_infreq(rincome))) +
geom_bar()# using fct_lump_n()
gss_cat %>%
mutate(rincome = fct_lump_n(rincome, n = 10)) %>% # lump all levels except the 10 most frequent
ggplot(aes(y = fct_infreq(rincome))) +
geom_bar()billboard %>%
mutate(artist = fct_lump_n(artist, n = -5)) %>% # lump all the levels that occur most often (exactly the opposite of what a positive number does)
filter(artist != "Other") %>%
ggplot(aes(y = artist)) +
geom_bar()billboard %>%
# ties.method controls which of the equally-frequent levels are kept
mutate(artist = fct_lump_n(artist, 5, ties.method = "min")) %>%
filter(artist != "Other") %>%
ggplot(aes(y = artist)) +
geom_bar()billboard %>%
mutate(artist = fct_lump_n(artist, 5, ties.method = "max")) %>%
filter(artist != "Other") %>%
ggplot(aes(y = artist)) +
geom_bar()billboard %>%
mutate(artist = fct_lump_n(artist, 5, ties.method = "random")) %>%
filter(artist != "Other") %>%
ggplot(aes(y = artist)) +
geom_bar()# fct_lump_prop()
# step by step explanation
# 1. how many times all levels occur in total
(total_count_income <- gss_cat %>% count(rincome) %>%
{
sum(.$n)
}) ## [1] 21483
# 2. choose a specific income range and how often occurs
(count_one_range <- gss_cat$rincome[gss_cat$rincome == "$20000 - 24999"] %>%
length())## [1] 1283
# 3. calculating the proportion
# 1283 / 21483, i.e. the share of rows in that income range
count_one_range / total_count_income## [1] 0.05972164
# the same but with fct_lump_prop()
gss_cat %>%
mutate(rincome = fct_lump_prop(rincome, .05)) %>% # levels that occur in less than 5% of all counts
ggplot(aes(y = fct_infreq(rincome))) +
geom_bar()# checking with tidyverse
# verify by hand which levels clear the 5% threshold
gss_cat %>%
count(rincome, name = "count_per_income_range") %>%
select(rincome, count_per_income_range) %>%
mutate(
total_count_income = sum(count_per_income_range),
percentage = count_per_income_range / total_count_income
) %>%
filter(percentage >= .05)## # A tibble: 4 × 4
## rincome count_per_income_range total_count_income percentage
## <fct> <int> <int> <dbl>
## 1 $25000 or more 7363 21483 0.343
## 2 $20000 - 24999 1283 21483 0.0597
## 3 $10000 - 14999 1168 21483 0.0544
## 4 Not applicable 7043 21483 0.328
# using fct_lump_lowfreq()
# without fct_lump_lowfreq()
# with fct_lump_n(), "Other" can end up larger than some kept levels
gss_cat %>%
mutate(
rincome = fct_lump_n(rincome, n = 10),
color_coding_rincome = ifelse(rincome == "Other", "a", "b")
) %>%
ggplot(aes(
y = fct_infreq(rincome),
fill = color_coding_rincome
)) +
scale_fill_manual(values = c("grey20", "grey80")) +
geom_bar(show.legend = FALSE)# with fct_lump_lowfreq()
# fct_lump_lowfreq() guarantees "Other" stays the least frequent level
gss_cat %>%
mutate(
rincome = fct_lump_lowfreq(rincome),
color_coding_rincome = ifelse(rincome == "Other", "a", "b")
) %>%
ggplot(aes(
y = fct_infreq(rincome),
fill = color_coding_rincome
)) +
scale_fill_manual(values = c("grey20", "grey80")) +
geom_bar(show.legend = FALSE)# using fct_reorder()
gss_cat %>%
tabyl(marital) %>%
ggplot() +
geom_col(aes(y = n, x = marital)) +
coord_flip()gss_cat %>%
tabyl(marital) %>%
# fct_reorder() sorts the marital levels by their count n for plotting
mutate(marital = fct_reorder(marital, n, .desc = FALSE)) %>%
ggplot() +
geom_col(aes(y = n, x = marital)) +
coord_flip()
R provides several options for dealing with date and date/time data.
The builtin as.Date function handles dates (without times);
the contributed package chron handles dates and times, but
does not control for time zones; and the POSIXct and
POSIXlt classes allow for dates and times with control for
time zones. The general rule for date/time data in R is to use the
simplest technique possible. Thus, for date only data,
as.Date will usually be the best choice. If you need to
handle dates and times, without time-zone information, the
chron package is a good choice; the POSIX
classes are especially useful when time-zone manipulation is important.
Also, don’t overlook the various “as.” functions (like
as.Date and as.POSIXlt) for converting among
the different date types when necessary. Except for the
POSIXlt class, dates are stored internally as the number of
days or seconds from some reference date. Thus, dates in R will
generally have a numeric mode, and the class function can be used to
find the way they are actually being stored. The POSIXlt
class stores date/time values as a list of components (hour, min, sec,
mon, etc.) making it easy to extract these parts. To get the current
date, the Sys.Date function will return a Date object which
can be converted to a different class if necessary.
The as.Date function allows a variety of input formats
through the format= argument. The default format is a
four-digit year, followed by a month, then a day, separated by either
dashes or slashes. If your input dates are not in the standard format, a
format string can be composed using the elements shown in the following
table.
Internally, Date objects are stored as
the number of days since January 1, 1970, using
negative numbers for earlier dates. The
as.numeric function
can be used to convert a Date object to its internal form. To convert
this form back to a Date object, it can be assigned a class of Date
directly. To extract the components of the dates, the
weekdays, months, days, or
quarters functions can be used.
The chron function converts dates and times to chron
objects. The dates and times are provided to the chron
function as separate values, so some preprocessing may be necessary to
prepare input date/times for the chron function. When using
character values, the default format for dates is the decimal month
value followed by the decimal day value followed by the year, using the
slash as a separator. Alternatively, dates can be specified by a numeric
value, representing the number of days since January 1, 1970. To input
dates stored as the day of the year, the origin= argument
can be used to interpret numeric dates relative to a different date. The
default format for times consists of the hour, minutes, and seconds,
separated by colons. Often the first task when using the
chron package is to break apart the date and times if they
are stored together. Chron values are stored internally as the
fractional number of days from January 1, 1970. The
as.numeric function can be used to access the internal
values.
POSIX represents a
portable operating system interface, primarily for UNIX systems, but
available on other operating systems as well. Dates stored in the
POSIX format are date/time values (like dates with the
chron package), but also allow modification of time zones.
Unlike the chron package, which stores times as fractions
of days, the POSIX date classes store times to the nearest
second, so they provide a more accurate representation of times. There
are two POSIX date/time classes, which differ
in the way that the values are stored internally. The
POSIXct class stores date/time values as the number of
seconds since January 1, 1970, while the POSIXlt class
stores them as a list with elements for second, minute, hour, day,
month, and year, among others. Unless you need the list nature of the
POSIXlt class, the POSIXct class is the usual
choice for storing dates in R. The default input format for
POSIX dates consists of the year, followed by the month and
day, separated by slashes or dashes; for date/time values, the date may
be followed by white space and a time in the form hour:minutes:seconds
or hour:minutes. Valid POSIX date or date/time inputs:
If your input date/times are stored as the number of seconds from
January 1, 1970, you can create POSIX date values by
assigning the appropriate class directly to those values. Since many
date manipulation functions refer to the POSIXt
pseudo-class, be sure to include it in the class attribute of the
values.
The POSIX date/time classes take advantage of the POSIX
date/time implementation of your operating system, allowing dates and
times in R to be manipulated in the same way they would be in, for
example, a C program. The two most important functions in this regard
are strptime, for inputting dates, and
strftime, for formatting dates for output. Both of these
functions use a variety of formatting codes. Nonformat characters (like
the slashes) are interpreted literally. When using
strptime, an optional time zone can be specified with the
tz= option.
Since POSIX date/time values are stored internally as
the number of seconds since January 1, 1970, they can easily use times
that are not represented by a formatted version of the hour, minute, and
second. Another way to create POSIX dates is to pass the individual
components of the time to the ISOdate function.
ISOdate will accept both numeric and character
arguments.
For formatting dates for output, the format function will recognize
the type of your input date, and perform any necessary conversions
before calling strftime, so strftime rarely
needs to be called directly.
When using POSIX dates, the optional
usetz=TRUE argument to the format function can be specified
to indicate that the time zone should be displayed. Additionally,
as.POSIXlt and as.POSIXct can also accept Date
or chron objects, so they can be input as described in the
previous sections and converted as needed. Conversion between the two
POSIX forms is also possible. The individual components of
a POSIX date/time object can be extracted by first
converting to POSIXlt if necessary, and then accessing the
components directly.
Many of the statistical summary functions, like mean,
min, max, etc are able to transparently handle
date objects. If two times (using any of the date or date/time classes)
are subtracted, R will return the result in the form of a time
difference, which represents a difftime object. If an
alternative unit of time was desired, the difftime function
could be called, using the optional units= argument with
any of the following values: “auto”, “secs”, “mins”, “hours”, “days”, or
“weeks”.
Although difftime values are displayed with their units,
they can be manipulated like ordinary numeric variables; arithmetic
performed with these values will retain the original units. To convert a
time difference in days to one of years, a good approximation is to
divide the number of days by 365.25. However, the difftime
value will display the time units as days. To modify this, the units
attribute of the object can be modified.
The by= argument to the seq function can be specified
either as a difftime value, or in any units of time that
the difftime function accepts, making it very easy to
generate sequences of dates. All the date classes except for
chron will accept an integer before the interval provided
as a by= argument. The cut function also
understands units of days, weeks, months, and years, making it very easy
to create factors grouped by these units.
Format codes can also be used to extract parts of dates, as an
alternative to the weekdays and other functions. This same
technique can be used to convert dates to factors.
# using as.Date
# default input format: year-month-day with dashes or slashes
as.Date("1915-6-16")## [1] "1915-06-16"
as.Date("1990/02/17")## [1] "1990-02-17"
# non-standard formats need an explicit format= string
as.Date("1/15/2001", format = "%m/%d/%Y")## [1] "2001-01-15"
as.Date("April 26, 2001", format = "%B %d, %Y")## [1] "2001-04-26"
as.Date("22JUN01", format = "%d%b%y")## [1] "2001-06-22"
# converting date to numeric and back to date
# Dates are stored as days since 1970-01-01
thedate <- as.Date("1/15/2001", format = "%m/%d/%Y")
ndate <- as.numeric(thedate)
ndate## [1] 11337
class(ndate) <- "Date"
ndate## [1] "2001-01-15"
# using chron
# chron() takes dates and times separately, so split the strings first
dtimes <- c("2002-06-09 12:45:40", "2003-01-29 09:30:40", "2002-09-04 16:45:40", "2002-11-13 20:00:40", "2002-07-07 17:30:40")
dtparts <- t(as.data.frame(strsplit(dtimes, " ")))
row.names(dtparts) <- NULL
thetimes <- chron(dates = dtparts[, 1], times = dtparts[, 2], format = c("y-m-d", "h:m:s"))
thetimes## [1] (02-06-09 12:45:40) (03-01-29 09:30:40) (02-09-04 16:45:40)
## [4] (02-11-13 20:00:40) (02-07-07 17:30:40)
# using POSIXlt
dts <- c("2005-10-21 18:47:22", "2005-12-24 16:39:58", "2005-10-28 07:30:05 PDT")
as.POSIXlt(dts)## [1] "2005-10-21 18:47:22 CEST" "2005-12-24 16:39:58 CET"
## [3] "2005-10-28 07:30:05 CEST"
# using POSIXct
# numeric values are seconds since 1970-01-01; assigning the class converts
dts <- c(1127056501, 1104295502, 1129233601, 1113547501, 1119826801, 1132519502, 1125298801, 1113289201)
mydates <- dts
# NOTE(review): current R constructs these objects with class
# c("POSIXct", "POSIXt") — the reverse order here is what the report used
class(mydates) <- c("POSIXt", "POSIXct")
mydates## [1] "2005-09-18 17:15:01 CEST" "2004-12-29 05:45:02 CET"
## [3] "2005-10-13 22:00:01 CEST" "2005-04-15 08:45:01 CEST"
## [5] "2005-06-27 01:00:01 CEST" "2005-11-20 21:45:02 CET"
## [7] "2005-08-29 09:00:01 CEST" "2005-04-12 09:00:01 CEST"
# or
# mydates <- structure(dts, class = c("POSIXt", "POSIXct"))
# using strptime
# non-format characters (slashes, colons) are matched literally
mydate <- strptime("16/Oct/2005:07:51:00", format = "%d/%b/%Y:%H:%M:%S")
mydate## [1] "2005-10-16 07:51:00 CEST"
# transforming times that are not represented by a formatted version
mydates <- c("20060515 112504.5", "20060518 101000.3", "20060520 20035.1")
dtparts <- t(as.data.frame(strsplit(mydates, " ")))
dtimes <- strptime(dtparts[, 1], format = "%Y%m%d")
as.numeric(dtparts[, 2])## [1] 112504.5 101000.3 20035.1
dtimes## c..20060515....112504.5.. c..20060518....101000.3.. c..20060520....20035.1..
## "2006-05-15 CEST" "2006-05-18 CEST" "2006-05-20 CEST"
# using ISOdate
# note the output shows GMT: "PDT" is apparently not accepted as a tz name here
ISOdate(2006, 5, 16, 7, 15, 04, tz = "PDT")## [1] "2006-05-16 07:15:04 GMT"
# formatting dates for output
thedate <- ISOdate(2005, 10, 21, 18, 47, 22, tz = "PDT")
format(thedate, "%A, %B %d, %Y %H:%M:%S")## [1] "Friday, October 21, 2005 18:47:22"
# extracting components of POSIX
# POSIXlt is a list of components, so $mday etc. can be read directly
mydate <- as.POSIXlt("2005-4-19 7:01:00")
names(mydate)## NULL
mydate$mday## [1] 19
# operations with dates
b1 <- ISOdate(1977, 7, 13)
b2 <- ISOdate(2003, 8, 14)
# subtracting date/times yields a difftime; difftime() lets you pick units
b2 - b1## Time difference of 9528 days
difftime(b2, b1, units = "weeks")## Time difference of 1361.143 weeks
# rdates <- scan(what = "")
# save(rdates, file = "input/rdates.RData")
# R release history, loaded from a previously saved workspace object
load("input/rdates.RData")
rdates <- as.data.frame(matrix(rdates, ncol = 2, byrow = TRUE))
rdates[, 2] <- as.Date(rdates[, 2], format = "%d%b%Y")
names(rdates) <- c("Release", "Date")
rdates## Release Date
## 1 1.0 2000-02-29
## 2 1.1 2000-06-15
## 3 1.2 2000-12-15
## 4 1.3 2001-06-22
## 5 1.4 2001-12-19
## 6 1.5 2002-04-29
## 7 1.6 2002-10-01
## 8 1.7 2003-04-16
## 9 1.8 2003-10-08
## 10 1.9 2004-04-12
## 11 2.0 2004-10-04
# summary functions handle Date objects transparently
mean(rdates$Date)## [1] "2002-05-19"
range(rdates$Date)## [1] "2000-02-29" "2004-10-04"
rdates$Date[11] - rdates$Date[1]## Time difference of 1679 days
# days-to-years approximation: divide by 365.25, then relabel the units
ydiff <- (b2 - b1) / 365.25
ydiff## Time difference of 26.08624 days
attr(ydiff, "units") <- "years"
ydiff## Time difference of 26.08624 years
# time sequences
seq(as.Date("1976-7-4"), by = "days", length = 10)## [1] "1976-07-04" "1976-07-05" "1976-07-06" "1976-07-07" "1976-07-08"
## [6] "1976-07-09" "1976-07-10" "1976-07-11" "1976-07-12" "1976-07-13"
seq(as.Date("2000-6-1"), to = as.Date("2000-8-1"), by = "2 weeks")## [1] "2000-06-01" "2000-06-15" "2000-06-29" "2000-07-13" "2000-07-27"
# extracting parts of dates
# format codes (%A = weekday name) are an alternative to weekdays() etc.
table(format(rdates$Date, "%A"))##
## Friday Monday Thursday Tuesday Wednesday
## 2 3 1 2 3
# converting dates to factors
fdate <- factor(format(rdates$Date, "%Y"))
fdate## [1] 2000 2000 2000 2001 2001 2002 2002 2003 2003 2004 2004
## Levels: 2000 2001 2002 2003 2004
In R, as in most programming languages, there’s a difference between a character string that looks like a date – “2019-06-21” or “June 21, 2019” – and an actual date object with specific methods (class-specific functions) that only work on dates. A date object can print out as “2019-06-21”, but its behavior will be different from the string version that also prints out as “2019-06-21”. For example, “2019-06-21” + 1 throws an error if “2019-06-21” is a character string, but will return “2019-06-22” for a date. lubridate is yet another tidyverse package that makes dealing with dates or durations (and intervals) as painless as possible.
There are several helpful functions included in
lubridate to convert columns to dates. For instance if
the column you want to convert is of the form “2012-11-21”, then you
would use the function ymd(), for “year-month-day”. If,
however the column is “2012-21-11”, then you would use
ydm(). There’s a few of these helper functions, and they
can handle a lot of different formats for dates. But you have to be
careful with leap years. When a year is not a leap year, the computation
returns NA. The same goes for months with a different number of days.
The way to solve these issues is to use the special %m+%
infix operator.
# Parsing and dissecting dates with lubridate.
# Lines ending in "## ..." are console output captured by the rendered report.
fechas <- c("1999/12/31", "2000/01/07", "2005/05/20", "2010/03/25")
# converting strings to dates
fechas <- lubridate::ymd(fechas)
class(fechas)## [1] "Date"
# extracting years
lubridate::year(fechas)## [1] 1999 2000 2005 2010
# extracting months
lubridate::month(fechas)## [1] 12 1 5 3
# label = TRUE returns an ordered factor of month names
lubridate::month(fechas, label = TRUE)## [1] Dec Jan May Mar
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
# weekdays
lubridate::wday(fechas)## [1] 6 6 6 5
lubridate::wday(fechas, label = TRUE)## [1] Fri Fri Fri Thu
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
# week
lubridate::week(fechas)## [1] 53 1 20 12
# semester
lubridate::semester(fechas)## [1] 2 1 1 1
# changing system defaults
sys_time_old <- Sys.getlocale("LC_TIME")
# NOTE(review): "Spanish.UTF-8" is a Windows-style locale name; the empty
# string returned below shows the call FAILED on this system, which is why
# the month labels that follow are still in English — on macOS/Linux the
# locale would be spelled e.g. "es_ES.UTF-8"
Sys.setlocale("LC_TIME", "Spanish.UTF-8")## [1] ""
lubridate::month(fechas, label = TRUE)## [1] Dec Jan May Mar
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
# restore the original locale
Sys.setlocale("LC_TIME", sys_time_old)## [1] "C"
# figuring out differences between dates
diff(fechas)## Time differences in days
## [1] 7 1960 1770
difftime(fechas[3], fechas[1], units = "weeks")## Time difference of 281 weeks
# sequence
seq(fechas[1], fechas[2], "day")## [1] "1999-12-31" "2000-01-01" "2000-01-02" "2000-01-03" "2000-01-04"
## [6] "2000-01-05" "2000-01-06" "2000-01-07"
seq(fechas[1], fechas[3], "year")## [1] "1999-12-31" "2000-12-31" "2001-12-31" "2002-12-31" "2003-12-31"
## [6] "2004-12-31"
# rounding dates
lubridate::round_date(fechas, "month")## [1] "1999-12-01" "2000-01-01" "2005-05-01" "2010-03-01"
lubridate::round_date(fechas, "year")## [1] "1999-01-01" "2000-01-01" "2005-01-01" "2010-01-01"
# time
llegada <- lubridate::ymd_hms("2011-06-04 12:25:00", tz = "Europe/Berlin")
salida <- lubridate::ymd_hms("2011-06-05 14:45:00", tz = "Europe/Berlin")
llegada## [1] "2011-06-04 12:25:00 CEST"
salida## [1] "2011-06-05 14:45:00 CEST"
lubridate::minute(llegada)## [1] 25
# with_tz() displays the same instant in another time zone
lubridate::with_tz(llegada, "America/Santiago")## [1] "2011-06-04 06:25:00 -04"
independence <- readRDS("input/independence.rds")
# converting variables to date objects
independence <- independence %>%
mutate(independence_date = lubridate::dmy(independence_date))
# Building a proper Date column from separate year/month components
# with lubridate::make_date() (day defaults to 1).
# NOTE(review): rnorm() is not seeded here, so the ta values shown in the
# captured output are not reproducible across runs
df_month <- data.frame(month = 1:12, year = 2000, ta = rnorm(12, 15, 2))
dplyr::mutate(df_month, date = lubridate::make_date(year, month))## month year ta date
## 1 1 2000 14.67087 2000-01-01
## 2 2 2000 15.16204 2000-02-01
## 3 3 2000 16.86329 2000-03-01
## 4 4 2000 14.89729 2000-04-01
## 5 5 2000 17.26322 2000-05-01
## 6 6 2000 15.16804 2000-06-01
## 7 7 2000 16.22075 2000-07-01
## 8 8 2000 16.56661 2000-08-01
## 9 9 2000 15.67249 2000-09-01
## 10 10 2000 11.69383 2000-10-01
## 11 11 2000 12.23488 2000-11-01
## 12 12 2000 16.35191 2000-12-01
# data manipulation with lubridate and dplyr
# countries that became independent in or before 1960
independence %>%
filter(year(independence_date) <= 1960) %>%
pull(country)## [1] "Liberia" "South Africa"
## [3] "Egypt" "Eritrea"
## [5] "Libya" "Sudan"
## [7] "Tunisia" "Ghana"
## [9] "Guinea" "Cameroon"
## [11] "Togo" "Mali"
## [13] "Madagascar" "Democratic Republic of the Congo"
## [15] "Benin" "Niger"
## [17] "Burkina Faso" "Ivory Coast"
## [19] "Chad" "Central African Republic"
## [21] "Republic of the Congo" "Gabon"
## [23] "Mauritania"
# which country became independent on a December 24th
independence %>%
filter(lubridate::month(independence_date) == 12,
lubridate::day(independence_date) == 24) %>%
pull(country)## [1] "Libya"
# build an interval from each independence date up to today
# NOTE(review): Morocco prints NA--NA below, i.e. its independence_date did
# not parse upstream — worth verifying the source data in independence.rds
independence %>%
mutate(today = lubridate::today()) %>%
mutate(independent_since = lubridate::interval(independence_date, today)) %>%
select(country, independent_since)## # A tibble: 54 × 2
## country independent_since
## <chr> <Interval>
## 1 Liberia 1847-07-26 UTC--2022-08-29 UTC
## 2 South Africa 1910-05-31 UTC--2022-08-29 UTC
## 3 Egypt 1922-02-28 UTC--2022-08-29 UTC
## 4 Eritrea 1947-02-10 UTC--2022-08-29 UTC
## 5 Libya 1951-12-24 UTC--2022-08-29 UTC
## 6 Sudan 1956-01-01 UTC--2022-08-29 UTC
## 7 Tunisia 1956-03-20 UTC--2022-08-29 UTC
## 8 Morocco NA--NA
## 9 Ghana 1957-03-06 UTC--2022-08-29 UTC
## 10 Guinea 1958-10-02 UTC--2022-08-29 UTC
## # … with 44 more rows
# convert the interval to a numeric span via as.numeric(x, "years")
independence %>%
mutate(today = lubridate::today()) %>%
mutate(independent_since = lubridate::interval(independence_date, today)) %>%
select(country, independent_since) %>%
mutate(years_independent = as.numeric(independent_since, "years"))## # A tibble: 54 × 3
## country independent_since years_independent
## <chr> <Interval> <dbl>
## 1 Liberia 1847-07-26 UTC--2022-08-29 UTC 175.
## 2 South Africa 1910-05-31 UTC--2022-08-29 UTC 112.
## 3 Egypt 1922-02-28 UTC--2022-08-29 UTC 100.
## 4 Eritrea 1947-02-10 UTC--2022-08-29 UTC 75.5
## 5 Libya 1951-12-24 UTC--2022-08-29 UTC 70.7
## 6 Sudan 1956-01-01 UTC--2022-08-29 UTC 66.7
## 7 Tunisia 1956-03-20 UTC--2022-08-29 UTC 66.4
## 8 Morocco NA--NA NA
## 9 Ghana 1957-03-06 UTC--2022-08-29 UTC 65.5
## 10 Guinea 1958-10-02 UTC--2022-08-29 UTC 63.9
## # … with 44 more rows
# per colonial power: how long ago its most recent colony became independent
# (min of years_independent within each group; na.rm drops the Morocco NA)
independence %>%
filter(colonial_power %in% c("Belgium", "France", "Portugal", "United Kingdom")) %>%
mutate(today = lubridate::today()) %>%
mutate(independent_since = lubridate::interval(independence_date, today)) %>%
mutate(years_independent = as.numeric(independent_since, "years")) %>%
group_by(colonial_power) %>%
dplyr::summarise(last_colony_independent_for = min(years_independent, na.rm = TRUE))## # A tibble: 4 × 2
## colonial_power last_colony_independent_for
## <chr> <dbl>
## 1 Belgium 60.2
## 2 France 45.2
## 3 Portugal 46.8
## 4 United Kingdom 46.2
# arithmetic with dates
# a bare number is treated as days; days()/years() make the unit explicit
lubridate::ymd("2018-12-31") + 16## [1] "2019-01-16"
lubridate::ymd("2018-12-31") + lubridate::days(16)## [1] "2019-01-16"
lubridate::ymd("2018-12-31") + lubridate::years(1)## [1] "2019-12-31"
# leap years
# attach lubridate so %m+%, years() and months() can be used unqualified
library("lubridate")
# adding a plain year to Feb 29 lands on a non-existent date -> NA;
# the %m+% operator instead rolls back to the last valid day of the month
lubridate::ymd("2016-02-29") + lubridate::years(1)## [1] NA
ymd("2016-02-29") %m+% years(1)## [1] "2017-02-28"
ymd("2018-12-31") %m+% months(2)## [1] "2019-02-28"
R provides a wide array of functions to aid in aggregating data. For
simple tabulation and cross-tabulation, the table function
is available. For more complex tasks, the available functions can be
broken down into two groups: those that are designed to work effectively
with arrays and/or lists, like apply, sweep,
mapply, sapply, and lapply, and
those that are oriented toward data frames (like aggregate
and by). There is considerable overlap between the two
tools, and the output of one can be converted to the equivalent of the
output from another, so often the choice of an appropriate function is a
matter of personal taste.
The arguments to the table function can either be
individual vectors representing the levels of interest, or a list or
data frame composed of such vectors. The result from table will
always be an array of as many dimensions as the number of vectors being
tabulated, with dimnames extracted from the levels
of the cross-tabulated variables. By default, table will
not include missing values in its output; to override this, use the
exclude=NULL argument. When passed a single vector of
values, table returns an object of class table, which can be treated as
a named vector. For simple queries regarding individual levels of a
tabulated variable, this may be the most convenient form of displaying
and storing the values. Alternatively, the output from
table can be converted to a data frame using
as.data.frame. When multiple vectors are passed to
table, an array of as many dimensions as there are vectors
is returned.
When passed a data frame, table treats each column as a
separate variable, resulting in a table that effectively counts how
often each row appears in the data frame. This can be especially useful
when the result of table is passed to as.data.frame, since
its form will be similar to the input data frame. Since the data frame
was formed from a table, all possible combinations,
including those with no observations, are included.
Sometimes it is helpful to display the margins of a
table, that is, the sum of each row and/or column, in order to
understand differences among the levels of the variables from which the
table was formed. The addmargins function accepts a table
and returns a similar table, with the requested margins added. To
specify which dimensions should have margins added, the
margin= argument accepts a vector of dimensions; a value of
1 in this vector means a new row with the margins for the columns will
be added, and a value of 2 corresponds to a new column containing row
margins. The default operation to create the margins is to use the
sum function. If some other function is desired, it can be
specified through the FUN= argument. When a margin is
added, the dimnames for the table are adjusted to include a
description of the margin.
When it’s desired to have a table of proportions instead of counts,
one strategy would be to use the sweep function dividing
each row and column by its corresponding margin. The
prop.table function provides a convenient wrapper around
this operation. prop.table accepts a table, and a margin=
argument, and returns a table of proportions. With no value specified
for margin=, the sum of all the cells in the table will be
1; with margin=1, each row of the resulting table will add
to 1, and with margin=2, each column will add to 1.
For tables with more than two dimensions, it may be useful to present
the table in a “flattened” form using the ftable function.
The xtabs function can produce similar results to the
table function, but uses the formula language interface. If
a variable is given on the left-hand side of the tilde (~),
it is interpreted as a vector of counts corresponding to the values of
the variables on the right-hand side, making it very easy to convert
already tabulated data into R’s notion of a table.
When confronted with an aggregation problem, there are three main considerations:
Thinking about these issues will help to point you to the most effective solution for your needs. The following paragraphs should help you make the best choice.
Groups defined as list elements. If the groups you’re
interested in are already organized as elements of a list, then
sapply or lapply are the appropriate
functions; they differ in that lapply always returns a
list, while sapply may simplify its output into a vector or
array if appropriate. This is a very flexible approach, since the entire
data frame for each group is available. Sometimes, if other methods are
inappropriate, you can first use the split function to create a suitable
list for use with sapply or lapply.
Groups defined by rows or columns of a matrix. When the goal
is to operate on each column or row of a matrix, the apply
function is the logical choice. apply will usually return
its results as a vector or array, but will return a list if the results
of operating on the rows or columns are of different dimensions.
Groups based on one or more grouping variables. A wide array
of choices is available for the very common task of operating on subsets
of data based on the value of a grouping variable. If the computations
you desire each involve only a single vector and produce a single scalar
as a result (like calculating a scalar-valued statistic for a variable
or set of variables), the aggregate function is the most
likely choice. Since aggregate always returns a data frame,
it is especially useful if the desired result is to create a plot or fit
a statistical model to the aggregated data.
If your computations involve a single vector, but the result is a
vector (for example, a set of quantiles or a vector of different
statistics), tapply is one available option. Unlike
aggregate, tapply returns its results in a
vector or array for which individual elements are easy to access but may
produce a difficult-to-interpret display for complex problems. Another
approach to the problem is provided by the reshape
package, available through CRAN. It uses a formula interface, and can
produce output in a variety of forms. When the desired result requires
access to more than one variable at a time (for example, calculating a
correlation matrix, or creating a scatter plot), row indices can be
passed to tapply to extract the appropriate rows
corresponding to each group. Alternatively, the by function
can be used. Unlike tapply, the special list returned by by
has a print method which will always produce an easily-readable display
of the aggregation, but accessing individual elements of the returned
list may be inconvenient. Naturally, for tasks like plotting, there is
no clear reason to choose one approach over the other.
As mentioned previously, using split and sapply/lapply
is a good solution if you find that other methods don’t provide the
flexibility you need. Finally, if nothing else seems to work, you can
write a loop to iterate over the values returned by unique or
intersection, and perform whatever operations you desire. If you take
this route, make sure to consider the issues about memory management in
loops.
Although most functions in R will automatically operate on each
element of a vector, the same is not true for lists. Since many R
functions return lists, it’s often useful to process each list element
in the same way that R naturally does for vectors. To handle situations
like this, R provides two functions: lapply and
sapply. Each of these functions takes a list or vector as
its first argument, and a function to be applied to each element as its
second argument. The difference between the two functions is that
lapply will always return its result as a list, while
sapply will simplify its output to a vector or matrix if
possible. Another important use of sapply relates to data
frames. When treated as a list, each column of a data frame retains its
mode and class. To get this information from a data frame,
sapply can be used; when the structure of the data would be
lost if sapply tried to simplify it into a vector or array,
the result is returned as a list instead.
This same idea can be used to extract columns of a data frame that meet
a particular condition. sapply or lapply can
be used as an alternative to loops for performing repetitive tasks. When
you use these functions, they take care of the details of deciding on
the appropriate form of the output, and eliminate the need to
incrementally build up a vector or matrix to store the result.
When your data has the added organization of an array, R provides a
convenient way to operate on each dimension of the data through the
apply function. This function requires three arguments: the
array on which to perform the operation, an index telling apply which
dimension to operate on, and the function to use. Like
sapply, additional arguments to the function can be placed
at the end of the argument list. For matrices, a second argument of 1
means “operate on the rows”, and 2 means “operate on the columns”. One
common use of apply is in conjunction with functions like
scale, which require summary statistics calculated for each
column of a matrix. Without additional arguments, the scale
function will subtract the mean of each column and divide by the
standard deviation, resulting in a matrix of zscores. To use other
statistics, appropriate vectors of values can be calculated using
apply and provided to scale using the center=
and scale= arguments. Similar to sapply,
apply will try to return its results in a vector or matrix
when appropriate, making it useful in cases where several quantities
need to be calculated for each row or column of a matrix.
apply will use names that are present in the input matrix
or data frame to properly label the result that it returns. If a vector
needs to be processed in non-overlapping groups, it is sometimes easiest
to temporarily treat the vector as a matrix, and use apply
to operate on the groups.
The apply function is very general, and for certain
applications, there may be more efficient methods available to perform
the necessary computations. For example, if the statistic to be
calculated is the sum or the mean, matrix computations will be more
efficient than calling apply with the appropriate function.
In cases like this, the rowSums, colSums,
rowMeans, or colMeans functions can be used. Each of these functions
accepts a matrix (or a data frame which will be coerced to a matrix),
and an optional na.rm= argument to specify the handling of
missing values. Since these functions will accept logical values as
input as well as numeric values, they can be very useful for counting
operations.
A common situation when processing a matrix by rows or columns is
that each row or column needs to be processed differently, based on the
values of an auxiliary vector which already exists. In cases like this,
the sweep function can be used. Like apply,
the first two arguments to sweep are the matrix to be operated on and
the index of the dimension to be used for repetitive processing. In
addition, sweep takes a third argument representing the
vector to be used when processing each column, and finally a fourth
argument providing the function to be used. sweep operates
by building matrices which can be operated on in a single call, so,
unlike apply, only functions which can operate on arrays of
values can be passed to sweep. All of the built-in binary
operators, such as addition (“+”), subtraction
(“-”), multiplication (“*”), and division
(“/”) can be used, but, in general, it will be necessary to
make sure an arbitrary function will work properly with
sweep.
To calculate scalar data summaries of one or more columns of a data
frame or matrix, the aggregate function can be used.
Although this function is limited to returning scalar values, it can
operate on multiple columns of its input argument, making it a natural
choice for data summaries for multiple variables. The first argument to
aggregate is a data frame or matrix containing the
variables to be summarized, the second argument is a list containing the
variables to be used for grouping, and the third argument is the
function to be used to summarize the data. Since the second argument
must be a list, when a data frame is being processed it is often
convenient to refer to the grouping columns using single bracket
subscripts, since columns accessed this way will naturally be in the
form of a list. In addition, with more than one grouping variable,
specifying the columns this way will ensure that the grouping variables’
names will be automatically transferred to the output data frame. If the
columns are passed as manually constructed list, aggregate will use
names like Group.1 to identify the grouping variables, unless names are
provided for the list elements.
To process a single vector based on the values of one or more
grouping vectors, the tapply function can also be used. The
returned value from tapply will be an array with as many
dimensions as there were vectors that defined the groups. Unlike
aggregate, tapply is not limited to returning
scalars. To convert values like this to data frames, the
dimnames of the returned object can be combined with the
values. When each element of the vector is of the same length, this
operation is fairly straightforward, but the problem becomes difficult
when the return values are of different lengths. When more than one
grouping variable is used with tapply, and the return value
from the function used is not a scalar, the returned object is somewhat
more difficult to interpret.
The by function generalizes the idea of
tapply to operate on entire data frames broken down by a
list of grouping variables. Thus, the first argument to by
is a data frame, and the remaining arguments are similar to those of
tapply. Each of the rows returned by the by
function is in the form that we would like for a data frame containing
these results, so it would be natural to use rbind to
convert this result to a data frame; however, it is tedious to pass each
row to the rbind function individually. In cases like this,
the do.call function can usually generalize the operation
so that it will be carried out properly regardless of how many elements
need to be processed. do.call takes a list of arguments and
passes them to a function as if they were the argument list for the
function call.
An alternative approach to aggregation is provided by the
reshape package, available from CRAN. The functions in
this package provide a unified approach to aggregation, based on an
extended formula notation. The core idea behind the
reshape package is to create a “melted” version of a
dataset (through the melt function), which can then be
“cast” (with the cast function) into an object with the
desired orientation. To melt a data frame, list, or array into the
appropriate melted form, it is first necessary to divide the variables
into id variables and measure or analysis variables; this should
generally be obvious from the nature of the data. By default, melt
treats factor and integer variables as id variables, and the remaining
variables as analysis variables; if your data is structured according to
this convention, no additional information needs to be provided to melt.
Otherwise, the id.var= or measure.var=
arguments can be used; if you specify one, it will assume all the other
variables are of the other type. Once a dataset is melted, it can be
cast into a variety of forms. Notice that melt displays the
names of variables that have been automatically assigned as id
variables. The basic melting operation preserves the id variables, and
converts the measured variables into two columns named
variable (which identifies which variable is being
measured) and value (which contains the actual values). You
can use a name other than variable by specifying a
variable_name= argument to melt. The left-hand side of the formula
passed to cast represents the variable(s) which will appear in the
columns of the result, and the right-hand side describes the variables
which will appear in the rows. Formulas used by cast can include a
single dot (.) to represent an overall summary, or three
dots ... to represent all variables not otherwise included
in the formula. When used for aggregation, an aggregation function
should be supplied; if not, it defaults to using length. To
limit the variables that are used, we can use the subset=
argument of cast. Since this argument uses the melted data, we need to
refer to the variable named variable. A list of functions can be
provided to cast function. To provide added flexibility,
the vertical bar (|) can be used to cause cast to produce a
list instead of a data frame. The default behavior of cast
is to only include combinations actually encountered in the data. To
include all possible combinations, use the add.missing=TRUE
argument. In each of the preceding examples, the dataset was first
melted, then repeated calls to cast were carried out. If only a single
call to cast is needed, the recast function combines the
melt and cast steps into a single call.
# Tabulation and aggregation examples: table(), addmargins(), prop.table(),
# ftable(), xtabs(), and mapping functions over lists with sapply().
# Lines ending in "## ..." are console output captured by the rendered report.
pets <- c("dog", "cat", "duck", "chicken", "duck", "cat", "dog")
tt <- table(pets)
# a table of a single vector behaves like a named vector
tt## pets
## cat chicken dog duck
## 2 1 2 2
tt["cat"]## cat
## 2
# converting output from table to a data frame
as.data.frame(tt)## pets Freq
## 1 cat 2
## 2 chicken 1
## 3 dog 2
## 4 duck 2
# passing multiple vectors to table
# hiinc: logical, is a state's income above the median?
hiinc <- state.x77[, "Income"] > median(state.x77[, "Income"])
stateinc <- table(state.region, hiinc)
stateinc## hiinc
## state.region FALSE TRUE
## Northeast 4 5
## South 12 4
## North Central 5 7
## West 4 9
as.data.frame(stateinc)## state.region hiinc Freq
## 1 Northeast FALSE 4
## 2 South FALSE 12
## 3 North Central FALSE 5
## 4 West FALSE 4
## 5 Northeast TRUE 5
## 6 South TRUE 4
## 7 North Central TRUE 7
## 8 West TRUE 9
# passing a data frame to table and converting it back to a data frame
x <- data.frame(a = c(1, 2, 2, 1, 2, 2, 1), b = c(1, 2, 2, 1, 1, 2, 1), c = c(1, 1, 2, 1, 2, 2, 1))
x## a b c
## 1 1 1 1
## 2 2 2 1
## 3 2 2 2
## 4 1 1 1
## 5 2 1 2
## 6 2 2 2
## 7 1 1 1
# note: all combinations appear, including those with zero observations
as.data.frame(table(x))## a b c Freq
## 1 1 1 1 3
## 2 2 1 1 0
## 3 1 2 1 0
## 4 2 2 1 1
## 5 1 1 2 0
## 6 2 1 2 1
## 7 1 2 2 0
## 8 2 2 2 2
# adding margins
# two-way table: education level by parity (infert dataset)
tt <- table(infert$education, infert$parity)
tt##
## 1 2 3 4 5 6
## 0-5yrs 3 0 0 3 0 6
## 6-11yrs 42 42 21 12 3 0
## 12+ yrs 54 39 15 3 3 2
# adding a row of margins
# margin = 1 appends a "Sum" row holding the column totals
tt1 <- addmargins(tt, 1)
tt1##
## 1 2 3 4 5 6
## 0-5yrs 3 0 0 3 0 6
## 6-11yrs 42 42 21 12 3 0
## 12+ yrs 54 39 15 3 3 2
## Sum 99 81 36 18 6 8
# adding margins to both rows and columns
tt12 <- addmargins(tt, c(1, 2))
tt12##
## 1 2 3 4 5 6 Sum
## 0-5yrs 3 0 0 3 0 6 12
## 6-11yrs 42 42 21 12 3 0 120
## 12+ yrs 54 39 15 3 3 2 116
## Sum 99 81 36 18 6 8 248
# getting proportions
# margin = 2: each COLUMN of the result sums to 1
prop.table(tt, 2)##
## 1 2 3 4 5 6
## 0-5yrs 0.03030303 0.00000000 0.00000000 0.16666667 0.00000000 0.75000000
## 6-11yrs 0.42424242 0.51851852 0.58333333 0.66666667 0.50000000 0.00000000
## 12+ yrs 0.54545455 0.48148148 0.41666667 0.16666667 0.50000000 0.25000000
# multidimensional table
# ftable flattens a 3-way table into a readable 2-D layout
ftable(UCBAdmissions)## Dept A B C D E F
## Admit Gender
## Admitted Male 512 353 120 138 53 22
## Female 89 17 202 131 94 24
## Rejected Male 313 207 205 279 138 351
## Female 19 8 391 244 299 317
# using xtabs
# formula interface to tabulation; no LHS means "count occurrences"
xtabs(~ state.region + hiinc)## hiinc
## state.region FALSE TRUE
## Northeast 4 5
## South 12 4
## North Central 5 7
## West 4 9
x <- data.frame(a = c(1, 2, 2, 1, 2, 2, 1), b = c(1, 2, 2, 1, 1, 2, 1), c = c(1, 1, 2, 1, 2, 2, 1))
dfx <- as.data.frame(table(x))
# with Freq on the LHS, already-tabulated counts are turned back into a table
xtabs(Freq ~ a + b + c, data = dfx)## , , c = 1
##
## b
## a 1 2
## 1 3 0
## 2 0 1
##
## , , c = 2
##
## b
## a 1 2
## 1 0 0
## 2 1 2
# mapping a Function to a Vector or List
text <- c("R is a free environment for statistical analysis", "It compiles and runs on a variety of platforms", "Visit the R home page for more information")
# strsplit returns a list with one character vector per input string
result <- strsplit(text, " ")
result## [[1]]
## [1] "R" "is" "a" "free" "environment"
## [6] "for" "statistical" "analysis"
##
## [[2]]
## [1] "It" "compiles" "and" "runs" "on" "a"
## [7] "variety" "of" "platforms"
##
## [[3]]
## [1] "Visit" "the" "R" "home" "page"
## [6] "for" "more" "information"
# reports the number of elements in the returned list (3)
length(result)## [1] 3
# finding out the length of the individual elements of the list (words)
# sapply simplifies the per-element lengths into a numeric vector
nwords <- sapply(result, length)
nwords## [1] 8 9 8
# getting mode and class from a data frame with sapply()
class(ChickWeight)## [1] "nfnGroupedData" "nfGroupedData" "groupedData" "data.frame"
# the Chick column has two classes, so sapply cannot simplify and
# returns a list instead of a character vector
sapply(ChickWeight, class)## $weight
## [1] "numeric"
##
## $Time
## [1] "numeric"
##
## $Chick
## [1] "ordered" "factor"
##
## $Diet
## [1] "factor"
# extracting columns of a data frame that meet a particular condition (numeric) with sapply()
# NOTE(review): comparing the (possibly list-valued) sapply result with ==
# works here, but an inherits()/vapply-based test would be more robust
df <- ChickWeight[, sapply(ChickWeight, class) == "numeric"]
df## weight Time
## 1 42 0
## 2 51 2
## 3 59 4
## 4 64 6
## 5 76 8
## 6 93 10
## 7 106 12
## 8 125 14
## 9 149 16
## 10 171 18
## 11 199 20
## 12 205 21
## 13 40 0
## 14 49 2
## 15 58 4
## 16 72 6
## 17 84 8
## 18 103 10
## 19 122 12
## 20 138 14
## 21 162 16
## 22 187 18
## 23 209 20
## 24 215 21
## 25 43 0
## 26 39 2
## 27 55 4
## 28 67 6
## 29 84 8
## 30 99 10
## 31 115 12
## 32 138 14
## 33 163 16
## 34 187 18
## 35 198 20
## 36 202 21
## 37 42 0
## 38 49 2
## 39 56 4
## 40 67 6
## 41 74 8
## 42 87 10
## 43 102 12
## 44 108 14
## 45 136 16
## 46 154 18
## 47 160 20
## 48 157 21
## 49 41 0
## 50 42 2
## 51 48 4
## 52 60 6
## 53 79 8
## 54 106 10
## 55 141 12
## 56 164 14
## 57 197 16
## 58 199 18
## 59 220 20
## 60 223 21
## 61 41 0
## 62 49 2
## 63 59 4
## 64 74 6
## 65 97 8
## 66 124 10
## 67 141 12
## 68 148 14
## 69 155 16
## 70 160 18
## 71 160 20
## 72 157 21
## 73 41 0
## 74 49 2
## 75 57 4
## 76 71 6
## 77 89 8
## 78 112 10
## 79 146 12
## 80 174 14
## 81 218 16
## 82 250 18
## 83 288 20
## 84 305 21
## 85 42 0
## 86 50 2
## 87 61 4
## 88 71 6
## 89 84 8
## 90 93 10
## 91 110 12
## 92 116 14
## 93 126 16
## 94 134 18
## 95 125 20
## 96 42 0
## 97 51 2
## 98 59 4
## 99 68 6
## 100 85 8
## 101 96 10
## 102 90 12
## 103 92 14
## 104 93 16
## 105 100 18
## 106 100 20
## 107 98 21
## 108 41 0
## 109 44 2
## 110 52 4
## 111 63 6
## 112 74 8
## 113 81 10
## 114 89 12
## 115 96 14
## 116 101 16
## 117 112 18
## 118 120 20
## 119 124 21
## 120 43 0
## 121 51 2
## 122 63 4
## 123 84 6
## 124 112 8
## 125 139 10
## 126 168 12
## 127 177 14
## 128 182 16
## 129 184 18
## 130 181 20
## 131 175 21
## 132 41 0
## 133 49 2
## 134 56 4
## 135 62 6
## 136 72 8
## 137 88 10
## 138 119 12
## 139 135 14
## 140 162 16
## 141 185 18
## 142 195 20
## 143 205 21
## 144 41 0
## 145 48 2
## 146 53 4
## 147 60 6
## 148 65 8
## 149 67 10
## 150 71 12
## 151 70 14
## 152 71 16
## 153 81 18
## 154 91 20
## 155 96 21
## 156 41 0
## 157 49 2
## 158 62 4
## 159 79 6
## 160 101 8
## 161 128 10
## 162 164 12
## 163 192 14
## 164 227 16
## 165 248 18
## 166 259 20
## 167 266 21
## 168 41 0
## 169 49 2
## 170 56 4
## 171 64 6
## 172 68 8
## 173 68 10
## 174 67 12
## 175 68 14
## 176 41 0
## 177 45 2
## 178 49 4
## 179 51 6
## 180 57 8
## 181 51 10
## 182 54 12
## 183 42 0
## 184 51 2
## 185 61 4
## 186 72 6
## 187 83 8
## 188 89 10
## 189 98 12
## 190 103 14
## 191 113 16
## 192 123 18
## 193 133 20
## 194 142 21
## 195 39 0
## 196 35 2
## 197 43 0
## 198 48 2
## 199 55 4
## 200 62 6
## 201 65 8
## 202 71 10
## 203 82 12
## 204 88 14
## 205 106 16
## 206 120 18
## 207 144 20
## 208 157 21
## 209 41 0
## 210 47 2
## 211 54 4
## 212 58 6
## 213 65 8
## 214 73 10
## 215 77 12
## 216 89 14
## 217 98 16
## 218 107 18
## 219 115 20
## 220 117 21
## 221 40 0
## 222 50 2
## 223 62 4
## 224 86 6
## 225 125 8
## 226 163 10
## 227 217 12
## 228 240 14
## 229 275 16
## 230 307 18
## 231 318 20
## 232 331 21
## 233 41 0
## 234 55 2
## 235 64 4
## 236 77 6
## 237 90 8
## 238 95 10
## 239 108 12
## 240 111 14
## 241 131 16
## 242 148 18
## 243 164 20
## 244 167 21
## 245 43 0
## 246 52 2
## 247 61 4
## 248 73 6
## 249 90 8
## 250 103 10
## 251 127 12
## 252 135 14
## 253 145 16
## 254 163 18
## 255 170 20
## 256 175 21
## 257 42 0
## 258 52 2
## 259 58 4
## 260 74 6
## 261 66 8
## 262 68 10
## 263 70 12
## 264 71 14
## 265 72 16
## 266 72 18
## 267 76 20
## 268 74 21
## 269 40 0
## 270 49 2
## 271 62 4
## 272 78 6
## 273 102 8
## 274 124 10
## 275 146 12
## 276 164 14
## 277 197 16
## 278 231 18
## 279 259 20
## 280 265 21
## 281 42 0
## 282 48 2
## 283 57 4
## 284 74 6
## 285 93 8
## 286 114 10
## 287 136 12
## 288 147 14
## 289 169 16
## 290 205 18
## 291 236 20
## 292 251 21
## 293 39 0
## 294 46 2
## 295 58 4
## 296 73 6
## 297 87 8
## 298 100 10
## 299 115 12
## 300 123 14
## 301 144 16
## 302 163 18
## 303 185 20
## 304 192 21
## 305 39 0
## 306 46 2
## 307 58 4
## 308 73 6
## 309 92 8
## 310 114 10
## 311 145 12
## 312 156 14
## 313 184 16
## 314 207 18
## 315 212 20
## 316 233 21
## 317 39 0
## 318 48 2
## 319 59 4
## 320 74 6
## 321 87 8
## 322 106 10
## 323 134 12
## 324 150 14
## 325 187 16
## 326 230 18
## 327 279 20
## 328 309 21
## 329 42 0
## 330 48 2
## 331 59 4
## 332 72 6
## 333 85 8
## 334 98 10
## 335 115 12
## 336 122 14
## 337 143 16
## 338 151 18
## 339 157 20
## 340 150 21
## 341 42 0
## 342 53 2
## 343 62 4
## 344 73 6
## 345 85 8
## 346 102 10
## 347 123 12
## 348 138 14
## 349 170 16
## 350 204 18
## 351 235 20
## 352 256 21
## 353 41 0
## 354 49 2
## 355 65 4
## 356 82 6
## 357 107 8
## 358 129 10
## 359 159 12
## 360 179 14
## 361 221 16
## 362 263 18
## 363 291 20
## 364 305 21
## 365 39 0
## 366 50 2
## 367 63 4
## 368 77 6
## 369 96 8
## 370 111 10
## 371 137 12
## 372 144 14
## 373 151 16
## 374 146 18
## 375 156 20
## 376 147 21
## 377 41 0
## 378 49 2
## 379 63 4
## 380 85 6
## 381 107 8
## 382 134 10
## 383 164 12
## 384 186 14
## 385 235 16
## 386 294 18
## 387 327 20
## 388 341 21
## 389 41 0
## 390 53 2
## 391 64 4
## 392 87 6
## 393 123 8
## 394 158 10
## 395 201 12
## 396 238 14
## 397 287 16
## 398 332 18
## 399 361 20
## 400 373 21
## 401 39 0
## 402 48 2
## 403 61 4
## 404 76 6
## 405 98 8
## 406 116 10
## 407 145 12
## 408 166 14
## 409 198 16
## 410 227 18
## 411 225 20
## 412 220 21
## 413 41 0
## 414 48 2
## 415 56 4
## 416 68 6
## 417 80 8
## 418 83 10
## 419 103 12
## 420 112 14
## 421 135 16
## 422 157 18
## 423 169 20
## 424 178 21
## 425 41 0
## 426 49 2
## 427 61 4
## 428 74 6
## 429 98 8
## 430 109 10
## 431 128 12
## 432 154 14
## 433 192 16
## 434 232 18
## 435 280 20
## 436 290 21
## 437 42 0
## 438 50 2
## 439 61 4
## 440 78 6
## 441 89 8
## 442 109 10
## 443 130 12
## 444 146 14
## 445 170 16
## 446 214 18
## 447 250 20
## 448 272 21
## 449 41 0
## 450 55 2
## 451 66 4
## 452 79 6
## 453 101 8
## 454 120 10
## 455 154 12
## 456 182 14
## 457 215 16
## 458 262 18
## 459 295 20
## 460 321 21
## 461 42 0
## 462 51 2
## 463 66 4
## 464 85 6
## 465 103 8
## 466 124 10
## 467 155 12
## 468 153 14
## 469 175 16
## 470 184 18
## 471 199 20
## 472 204 21
## 473 42 0
## 474 49 2
## 475 63 4
## 476 84 6
## 477 103 8
## 478 126 10
## 479 160 12
## 480 174 14
## 481 204 16
## 482 234 18
## 483 269 20
## 484 281 21
## 485 42 0
## 486 55 2
## 487 69 4
## 488 96 6
## 489 131 8
## 490 157 10
## 491 184 12
## 492 188 14
## 493 197 16
## 494 198 18
## 495 199 20
## 496 200 21
## 497 42 0
## 498 51 2
## 499 65 4
## 500 86 6
## 501 103 8
## 502 118 10
## 503 127 12
## 504 138 14
## 505 145 16
## 506 146 18
## 507 41 0
## 508 50 2
## 509 61 4
## 510 78 6
## 511 98 8
## 512 117 10
## 513 135 12
## 514 141 14
## 515 147 16
## 516 174 18
## 517 197 20
## 518 196 21
## 519 40 0
## 520 52 2
## 521 62 4
## 522 82 6
## 523 101 8
## 524 120 10
## 525 144 12
## 526 156 14
## 527 173 16
## 528 210 18
## 529 231 20
## 530 238 21
## 531 41 0
## 532 53 2
## 533 66 4
## 534 79 6
## 535 100 8
## 536 123 10
## 537 148 12
## 538 157 14
## 539 168 16
## 540 185 18
## 541 210 20
## 542 205 21
## 543 39 0
## 544 50 2
## 545 62 4
## 546 80 6
## 547 104 8
## 548 125 10
## 549 154 12
## 550 170 14
## 551 222 16
## 552 261 18
## 553 303 20
## 554 322 21
## 555 40 0
## 556 53 2
## 557 64 4
## 558 85 6
## 559 108 8
## 560 128 10
## 561 152 12
## 562 166 14
## 563 184 16
## 564 203 18
## 565 233 20
## 566 237 21
## 567 41 0
## 568 54 2
## 569 67 4
## 570 84 6
## 571 105 8
## 572 122 10
## 573 155 12
## 574 175 14
## 575 205 16
## 576 234 18
## 577 264 20
## 578 264 21
# a more complex example of using sapply()
# maxcor: draw an n x m matrix of standard-normal noise and return the
# largest off-diagonal pairwise correlation between its columns.
# i is a dummy argument so the function can be mapped over an index vector
# (the apply family always passes an element of X to the function).
maxcor <- function(i, n = 10, m = 5) {
mat <- matrix(rnorm(n * m), n, m)
corr <- cor(mat)
diag(corr) <- NA # drop the trivial self-correlations (always 1)
max(corr, na.rm = TRUE)
}
# vapply() guarantees a numeric(1) result per replicate; sapply() can
# silently change its return type, which is unsafe in non-interactive code
maxcors <- vapply(1:1000, maxcor, numeric(1), n = 100)
mean(maxcors)## [1] 0.156293
# using apply()
# robust standardisation: centre each column at its median and scale by its
# MAD (median absolute deviation) instead of the usual mean / sd
sstate <- scale(state.x77, center = apply(state.x77, 2, median), scale = apply(state.x77, 2, mad))
sstate## Population Income Illiteracy Life Exp Murder
## Alabama 0.268654565 -1.53997252 2.21618392 -1.053891812 1.589871076
## Alaska -0.855785018 3.09026889 1.05991405 -0.885269122 0.857566823
## Arizona -0.216757354 0.01892704 1.63804899 -0.081068601 0.183076063
## Arkansas -0.252047457 -1.96324989 1.83076063 -0.009728232 0.626312848
## California 6.352045703 1.02378062 0.28906747 0.671248015 0.664855177
## Colorado -0.102929470 0.62803349 -0.48177911 0.898240098 -0.009635582
## Connecticut 0.090474139 1.42641031 0.28906747 1.170630597 -0.722668671
## Delaware -0.781744997 0.49898551 -0.09635582 -0.398857516 -0.125262570
## Florida 1.881619900 0.50930935 0.67449076 -0.009728232 0.741939835
## Georgia 0.723966101 -0.73643379 2.02347228 -1.384651703 1.358617101
## Hawaii -0.681756369 0.76396402 1.83076063 1.897005261 -0.125262570
## Idaho -0.700785347 -0.68825588 -0.67449076 0.775015825 -0.298703051
## Illinois 2.891885618 1.01173614 -0.09635582 -0.346973612 0.664855177
## Indiana 0.856131000 -0.10495902 -0.48177911 0.132952505 0.048177911
## Iowa 0.007784582 0.18754973 -0.86720241 1.222514502 -0.876837987
## Kansas -0.193230618 0.25809595 -0.67449076 1.235485478 -0.452872367
## Kentucky 0.189770804 -1.38855623 1.25262570 -0.372915564 0.722668671
## Louisiana 0.334737015 -1.67590306 3.56516544 -1.241970966 1.223718949
## Maine -0.616019901 -1.41952775 -0.48177911 -0.184836410 -0.799753329
## Maryland 0.444067140 1.34209896 -0.09635582 -0.295089707 0.317974215
## Massachusetts 1.029467686 0.40607097 0.28906747 0.749073872 -0.684126342
## Michigan 2.170168396 0.39918841 -0.09635582 -0.029184696 0.819024494
## Minnesota 0.374524877 0.26841979 -0.67449076 1.481934024 -0.876837987
## Mississippi -0.172125752 -2.44502900 2.79431886 -1.676498667 1.088820797
## Missouri 0.667225150 -0.45596952 -0.28906747 0.009728232 0.472143532
## Montana -0.723966101 -0.29595003 -0.67449076 -0.074583113 -0.356516544
## Nebraska -0.447872936 -0.01892704 -0.67449076 1.248456454 -0.761211000
## Nevada -0.777939201 1.08400301 -0.86720241 -1.066862788 0.896109152
## New Hampshire -0.701131328 -0.40951225 -0.48177911 0.359944588 -0.684126342
## New Jersey 1.555013449 1.23541930 0.28906747 0.165379946 -0.317974215
## New Mexico -0.586265500 -1.57954724 2.40889557 -0.230234827 0.549228190
## New York 5.271891740 0.66072564 0.86720241 -0.081068601 0.780482165
## North Carolina 0.900416620 -1.10809196 1.63804899 -0.950124003 0.819024494
## North Dakota -0.761678075 0.97732335 -0.28906747 1.365195239 -1.050278468
## Ohio 2.732042207 0.07226687 -0.28906747 0.094039577 0.105991405
## Oklahoma -0.042728704 -0.92226288 0.28906747 0.483168861 -0.086720241
## Oregon -0.191846692 0.24261020 -0.67449076 0.943638514 -0.510685861
## Pennsylvania 3.121271293 -0.12044478 0.09635582 -0.158894458 -0.144533734
## Rhode Island -0.659959540 0.06710495 0.67449076 0.794472289 -0.857566823
## South Carolina -0.007784582 -1.52104549 2.60160722 -1.760810012 0.915380316
## South Dakota -0.746454893 -0.60566517 -0.86720241 0.911211074 -0.992464975
## Tennessee 0.461712192 -1.20100651 1.44533734 -0.366430076 0.799753329
## Texas 3.251706285 -0.56953174 2.40889557 0.145923482 1.031007304
## Utah -0.565852597 -0.85515793 -0.67449076 1.443021096 -0.452872367
## Vermont -0.818765008 -1.05303149 -0.67449076 0.625849599 -0.260160722
## Virginia 0.741265172 0.31315642 0.86720241 -0.385886540 0.510685861
## Washington 0.249279606 0.59362069 -0.67449076 0.677733504 -0.491414696
## West Virginia -0.359647676 -1.55201700 0.86720241 -0.775015825 -0.028906747
## Wisconsin 0.605640459 -0.08775262 -0.48177911 1.170630597 -0.741939835
## Wyoming -0.851979223 0.08087007 -0.67449076 -0.249691291 0.009635582
## HS Grad Frost Area
## Alabama -1.389683548 -1.770538244 -0.101552765
## Alaska 1.564120813 0.702594541 14.572921383
## Arizona 0.564013825 -1.864217516 1.682776836
## Arkansas -1.552491662 -0.927424794 -0.066355015
## California 1.087325621 -1.770538244 2.904710696
## Colorado 1.238504584 0.964896503 1.408166095
## Connecticut 0.319801653 0.459028434 -1.406060490
## Delaware 0.156993539 -0.215462326 -1.488008364
## Florida -0.075589482 -1.939160933 -0.005320921
## Georgia -1.471087605 -1.021104066 0.108011851
## Hawaii 1.005921564 -2.145255332 -1.361586695
## Idaho 0.726821939 0.215462326 0.808097094
## Illinois -0.075589482 0.234198180 0.041856015
## Indiana -0.040702029 0.140518908 -0.517295957
## Iowa 0.668676184 0.477764288 0.047347661
## Kansas 0.773338543 -0.009367927 0.782772925
## Kentucky -1.715299776 -0.365349161 -0.416198458
## Louisiana -1.285021188 -1.920425079 -0.265960688
## Maine 0.168622690 0.871217231 -0.664602952
## Maryland -0.110476935 -0.252934035 -1.262964705
## Massachusetts 0.610530429 -0.215462326 -1.321722469
## Michigan -0.052331180 0.196726472 0.072273473
## Minnesota 0.505868070 0.852481377 0.711694525
## Mississippi -1.424571001 -1.208462611 -0.198638233
## Missouri -0.517497221 -0.121783054 0.418787783
## Montana 0.691934486 0.758802104 2.598145974
## Nebraska 0.703563637 0.459028434 0.631852256
## Nevada 1.389683548 1.377085301 1.582390690
## New Hampshire 0.505868070 1.114783339 -1.287549067
## New Jersey -0.087218633 0.009367927 -1.330400977
## New Mexico 0.226768445 0.103047199 1.910267550
## New York -0.063960331 -0.608915269 -0.183415277
## North Carolina -1.715299776 -0.646386978 -0.155900140
## North Dakota -0.343059955 1.339613592 0.426698029
## Ohio -0.005814576 0.177990617 -0.378496745
## Oklahoma -0.191880992 -0.608915269 0.412727055
## Oregon 0.784967694 -1.320877737 1.192426934
## Pennsylvania -0.354689106 0.215462326 -0.264936340
## Rhode Island -0.796596845 0.234198180 -1.514556061
## South Carolina -1.796703833 -0.927424794 -0.684378567
## South Dakota 0.005814576 1.077311630 0.616828479
## Tennessee -1.331537792 -0.833745522 -0.368452439
## Texas -0.680305335 -1.489500427 5.914388652
## Utah 1.633895719 0.421556725 0.791565249
## Vermont 0.447722314 1.002368212 -1.280720078
## Virginia -0.633788731 -0.552707706 -0.412499422
## Washington 1.191987980 -1.545707990 0.349786534
## West Virginia -1.354796094 -0.271669889 -0.859513695
## Wisconsin 0.145364388 0.646386978 0.005320921
## Wyoming 1.122213074 1.096047484 1.221421685
## attr(,"scaled:center")
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 2838.500 4519.000 0.950 70.675 6.850 53.250 114.500
## Area
## 54277.000
## attr(,"scaled:scale")
## Population Income Illiteracy Life Exp Murder HS Grad
## 2890.328700 581.179200 0.518910 1.541904 5.189100 8.599080
## Frost Area
## 53.373600 35144.291700
# another example
# per-column summary: non-missing count, mean and standard deviation
summfun <- function(v) {
c(n = sum(!is.na(v)), mean = mean(v), sd = sd(v))
}
# apply the summary to every column of the state.x77 matrix
colstats <- apply(state.x77, 2, summfun)
t(colstats)## n mean sd
## Population 50 4246.4200 4464.4914334
## Income 50 4435.8000 614.4699392
## Illiteracy 50 1.1700 0.6095331
## Life Exp 50 70.8786 1.3423936
## Murder 50 7.3780 3.6915397
## HS Grad 50 53.1080 8.0769978
## Frost 50 104.4600 51.9808481
## Area 50 70735.8800 85327.2996224
# apply() for non-overlapping groups
# sum consecutive triples of x by reshaping it into a 4 x 3 matrix first;
# rowSums() would be the dedicated (and faster) alternative
x <- 1:12
apply(matrix(x, ncol = 3, byrow = TRUE), 1, sum) # first need to be converted into a matrix## [1] 6 15 24 33
# specific statistics calculation without apply()
# colMeans() is the dedicated replacement for apply(x, 2, mean)
mns <- colMeans(USJudgeRatings)
mns## CONT INTG DMNR DILG CFMG DECI PREP FAMI
## 7.437209 8.020930 7.516279 7.693023 7.479070 7.565116 7.467442 7.488372
## ORAL WRIT PHYS RTEN
## 7.293023 7.383721 7.934884 7.602326
# count, per judge, how many of the ratings are at least 8
jscore <- rowSums(USJudgeRatings >= 8)
head(jscore)## AARONSON,L.H. ALEXANDER,J.M. ARMENTANO,A.J. BERDON,R.I. BRACKEN,J.J.
## 1 8 1 11 0
## BURNS,E.B.
## 10
# using sweep()
# express every value as a fraction of its column maximum
maxes <- apply(state.x77, 2, max)
swept <- sweep(state.x77, 2, maxes, "/")
head(swept)## Population Income Illiteracy Life Exp Murder HS Grad
## Alabama 0.17053496 0.5738717 0.7500000 0.9381793 1.0000000 0.6136701
## Alaska 0.01721861 1.0000000 0.5357143 0.9417120 0.7483444 0.9910847
## Arizona 0.10434947 0.7173397 0.6428571 0.9585598 0.5165563 0.8632987
## Arkansas 0.09953769 0.5349169 0.6785714 0.9600543 0.6688742 0.5928678
## California 1.00000000 0.8098179 0.3928571 0.9743207 0.6821192 0.9301634
## Colorado 0.11986980 0.7733967 0.2500000 0.9790761 0.4503311 0.9494799
## Frost Area
## Alabama 0.10638298 0.08952178
## Alaska 0.80851064 1.00000000
## Arizona 0.07978723 0.20023057
## Arkansas 0.34574468 0.09170562
## California 0.10638298 0.27604549
## Colorado 0.88297872 0.18319233
# non-working sweep() example
# right way to proceed
# meanmed: mean of the values of var lying strictly above the cutoff med
meds <- apply(state.x77, 2, median)
meanmed <- function(var, med) mean(var[var > med])
meanmed(state.x77[, 1], meds[1]) # for every col## [1] 7136.16
meanmed(state.x77[, 2], meds[2])## [1] 4917.92
# opposite to using sweep() -> doesn't work! sweep() expects a vectorised
# operator, so it returns only a single value here
sweep(state.x77, 2, meds, meanmed)## [1] 15569.75
# solution via mapply(): walk over the columns and their medians in parallel
mapply(meanmed, as.data.frame(state.x77), meds)## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 7136.160 4917.920 1.660 71.950 10.544 59.524 146.840
## Area
## 112213.400
# mapping a function based on groups
# aggregate() computes a statistic of each column per level of a grouping variable
aggregate(iris[-5], iris[5], mean)## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.006 3.428 1.462 0.246
## 2 versicolor 5.936 2.770 4.260 1.326
## 3 virginica 6.588 2.974 5.552 2.026
# mean chick weight for every Time x Diet combination
weights <- aggregate(ChickWeight$weight, ChickWeight[c("Time", "Diet")], mean)
# or
# weights <- aggregate(ChickWeight$weight, list(Time = ChickWeight$Time, Diet = ChickWeight$Diet), mean)
weights## Time Diet x
## 1 0 1 41.40000
## 2 2 1 47.25000
## 3 4 1 56.47368
## 4 6 1 66.78947
## 5 8 1 79.68421
## 6 10 1 93.05263
## 7 12 1 108.52632
## 8 14 1 123.38889
## 9 16 1 144.64706
## 10 18 1 158.94118
## 11 20 1 170.41176
## 12 21 1 177.75000
## 13 0 2 40.70000
## 14 2 2 49.40000
## 15 4 2 59.80000
## 16 6 2 75.40000
## 17 8 2 91.70000
## 18 10 2 108.50000
## 19 12 2 131.30000
## 20 14 2 141.90000
## 21 16 2 164.70000
## 22 18 2 187.70000
## 23 20 2 205.60000
## 24 21 2 214.70000
## 25 0 3 40.80000
## 26 2 3 50.40000
## 27 4 3 62.20000
## 28 6 3 77.90000
## 29 8 3 98.40000
## 30 10 3 117.10000
## 31 12 3 144.40000
## 32 14 3 164.50000
## 33 16 3 197.40000
## 34 18 3 233.10000
## 35 20 3 258.90000
## 36 21 3 270.30000
## 37 0 4 41.00000
## 38 2 4 51.80000
## 39 4 4 64.50000
## 40 6 4 83.90000
## 41 8 4 105.60000
## 42 10 4 126.00000
## 43 12 4 151.40000
## 44 14 4 161.80000
## 45 16 4 182.00000
## 46 18 4 202.90000
## 47 20 4 233.88889
## 48 21 4 238.55556
# a single vector based on the values of one or more grouping vectors using tapply()
maxweight <- tapply(PlantGrowth$weight, PlantGrowth$group, max)
# as.table()/as.data.frame() turn the named vector into a tidy two-column frame
as.data.frame(as.table(maxweight), responseName = "MaxWeight")## Var1 MaxWeight
## 1 ctrl 6.11
## 2 trt1 6.03
## 3 trt2 6.31
# range() returns two values per group, so tapply() returns a list here
ranges <- tapply(PlantGrowth$weight, PlantGrowth$group, range)
ranges## $ctrl
## [1] 4.17 6.11
##
## $trt1
## [1] 3.59 6.03
##
## $trt2
## [1] 4.92 6.31
# converting the results to a data frame
# flatten the list of (min, max) pairs into a two-column matrix, one row per group
data.frame(group = dimnames(ranges)[[1]], matrix(unlist(ranges), ncol = 2, byrow = TRUE))## group X1 X2
## 1 ctrl 4.17 6.11
## 2 trt1 3.59 6.03
## 3 trt2 4.92 6.31
# using more than one grouping variable with tapply() and the returned value is NOT a scalar
ranges1 <- tapply(CO2$uptake, CO2[c("Type", "Treatment")], range)
# expand.grid() rebuilds the Type x Treatment combinations to label the rows
data.frame(expand.grid(dimnames(ranges1)), matrix(unlist(ranges1), byrow = TRUE, ncol = 2))## Type Treatment X1 X2
## 1 Quebec nonchilled 13.6 45.5
## 2 Mississippi nonchilled 10.6 35.5
## 3 Quebec chilled 9.3 42.4
## 4 Mississippi chilled 7.7 22.2
# using by()
# sumfun: one-row data frame with n, mean and sd of the uptake column
sumfun <- function(x) data.frame(n = length(x$uptake), mean = mean(x$uptake), sd = sd(x$uptake))
# by() passes each Type x Treatment subset of CO2 (a data frame) to sumfun
bb <- by(CO2, CO2[c("Type", "Treatment")], sumfun)
bb## Type: Quebec
## Treatment: nonchilled
## n mean sd
## 1 21 35.33333 9.596371
## ------------------------------------------------------------
## Type: Mississippi
## Treatment: nonchilled
## n mean sd
## 1 21 25.95238 7.402136
## ------------------------------------------------------------
## Type: Quebec
## Treatment: chilled
## n mean sd
## 1 21 31.75238 9.644823
## ------------------------------------------------------------
## Type: Mississippi
## Treatment: chilled
## n mean sd
## 1 21 15.81429 4.058976
# stack the per-group one-row data frames and prepend the group labels
cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))## Type Treatment n mean sd
## 1 Quebec nonchilled 21 35.33333 9.596371
## 2 Mississippi nonchilled 21 25.95238 7.402136
## 3 Quebec chilled 21 31.75238 9.644823
## 4 Mississippi chilled 21 15.81429 4.058976
# state.x77 plus the state name and census region as ordinary columns
states <- data.frame(state.x77, state = row.names(state.x77), region = state.region, row.names = 1:50)
head(states)## Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area state
## 1 3615 3624 2.1 69.05 15.1 41.3 20 50708 Alabama
## 2 365 6315 1.5 69.31 11.3 66.7 152 566432 Alaska
## 3 2212 4530 1.8 70.55 7.8 58.1 15 113417 Arizona
## 4 2110 3378 1.9 70.66 10.1 39.9 65 51945 Arkansas
## 5 21198 5114 1.1 71.71 10.3 62.6 20 156361 California
## 6 2541 4884 0.7 72.06 6.8 63.9 166 103766 Colorado
## region
## 1 South
## 2 West
## 3 West
## 4 South
## 5 West
## 6 West
# melt to long format: one row per state x variable combination
# (the non-numeric columns state and region are kept as id variables)
mstates <- reshape::melt(states)
head(mstates)## state region variable value
## 1 Alabama South Population 3615
## 2 Alaska West Population 365
## 3 Arizona West Population 2212
## 4 Arkansas South Population 2110
## 5 California West Population 21198
## 6 Colorado West Population 2541
# cast back to wide: regions as rows, one column per variable, cell = group mean
reshape::cast(mstates, region ~ variable, mean)## region Population Income Illiteracy Life.Exp Murder HS.Grad
## 1 Northeast 5495.111 4570.222 1.000000 71.26444 4.722222 53.96667
## 2 South 4208.125 4011.938 1.737500 69.70625 10.581250 44.34375
## 3 North Central 4803.000 4611.083 0.700000 71.76667 5.275000 54.51667
## 4 West 2915.308 4702.615 1.023077 71.23462 7.215385 62.00000
## Frost Area
## 1 132.7778 18141.00
## 2 64.6250 54605.12
## 3 138.8333 62652.00
## 4 102.1538 134463.00
# transposed layout: variables as rows and regions as columns
reshape::cast(mstates, variable ~ region, mean)## variable Northeast South North Central West
## 1 Population 5495.111111 4208.12500 4803.00000 2915.307692
## 2 Income 4570.222222 4011.93750 4611.08333 4702.615385
## 3 Illiteracy 1.000000 1.73750 0.70000 1.023077
## 4 Life.Exp 71.264444 69.70625 71.76667 71.234615
## 5 Murder 4.722222 10.58125 5.27500 7.215385
## 6 HS.Grad 53.966667 44.34375 54.51667 62.000000
## 7 Frost 132.777778 64.62500 138.83333 102.153846
## 8 Area 18141.000000 54605.12500 62652.00000 134463.000000
# restrict the cast to a subset of the molten variables
reshape::cast(mstates, region ~ variable, mean, subset = variable %in% c("Population", "Life.Exp"))## region Population Life.Exp
## 1 Northeast 5495.111 71.26444
## 2 South 4208.125 69.70625
## 3 North Central 4803.000 71.76667
## 4 West 2915.308 71.23462
# several aggregation functions at once; "." collapses the row dimension
reshape::cast(mstates, . ~ variable, c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp"))## value Population_mean Population_median Population_sd Life.Exp_mean
## 1 (all) 4246.42 2838.5 4464.491 70.8786
## Life.Exp_median Life.Exp_sd
## 1 70.675 1.342394
# or
# reshape::cast(mstates, variable ~ ., c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp"))
# using a grouping variable
# region on the left-hand side adds one result row per region
reshape::cast(mstates, region ~ variable, c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp")) # data frame## region Population_mean Population_median Population_sd Life.Exp_mean
## 1 Northeast 5495.111 3100.0 6079.565 71.26444
## 2 South 4208.125 3710.5 2779.508 69.70625
## 3 North Central 4803.000 4255.0 3702.828 71.76667
## 4 West 2915.308 1144.0 5578.607 71.23462
## Life.Exp_median Life.Exp_sd
## 1 71.23 0.7438769
## 2 70.07 1.0221994
## 3 72.28 1.0367285
## 4 71.71 1.3519715
# conditioning with "|" splits the result into a list with one element per region
reshape::cast(mstates, variable ~ . | region, c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp")) # list## $Northeast
## variable mean median sd
## 1 Population 5495.11111 3100.00 6079.5651457
## 2 Life.Exp 71.26444 71.23 0.7438769
##
## $South
## variable mean median sd
## 1 Population 4208.12500 3710.50 2779.508251
## 2 Life.Exp 69.70625 70.07 1.022199
##
## $`North Central`
## variable mean median sd
## 1 Population 4803.00000 4255.00 3702.827593
## 2 Life.Exp 71.76667 72.28 1.036729
##
## $West
## variable mean median sd
## 1 Population 2915.30769 1144.00 5578.607015
## 2 Life.Exp 71.23462 71.71 1.351971
# another example
# melt ChickWeight keeping only the weight measurements
mChick <- reshape::melt(ChickWeight, measure.var = "weight")
head(reshape::cast(mChick, Diet + Time ~ variable, median))## Diet Time weight
## 1 1 0 41
## 2 1 2 49
## 3 1 4 56
## 4 1 6 67
## 5 1 8 79
## 6 1 10 93
# wide layout: one column per Time point (column names are Time_variable)
reshape::cast(mChick, Diet ~ Time + variable, mean)## Diet 0_weight 2_weight 4_weight 6_weight 8_weight 10_weight 12_weight
## 1 1 41.4 47.25 56.47368 66.78947 79.68421 93.05263 108.5263
## 2 2 40.7 49.40 59.80000 75.40000 91.70000 108.50000 131.3000
## 3 3 40.8 50.40 62.20000 77.90000 98.40000 117.10000 144.4000
## 4 4 41.0 51.80 64.50000 83.90000 105.60000 126.00000 151.4000
## 14_weight 16_weight 18_weight 20_weight 21_weight
## 1 123.3889 144.6471 158.9412 170.4118 177.7500
## 2 141.9000 164.7000 187.7000 205.6000 214.7000
## 3 164.5000 197.4000 233.1000 258.9000 270.3000
## 4 161.8000 182.0000 202.9000 233.8889 238.5556
# one data frame per Diet, collected in a list
reshape::cast(mChick, Time ~ variable | Diet, mean)## $`1`
## Time weight
## 1 0 41.40000
## 2 2 47.25000
## 3 4 56.47368
## 4 6 66.78947
## 5 8 79.68421
## 6 10 93.05263
## 7 12 108.52632
## 8 14 123.38889
## 9 16 144.64706
## 10 18 158.94118
## 11 20 170.41176
## 12 21 177.75000
##
## $`2`
## Time weight
## 1 0 40.7
## 2 2 49.4
## 3 4 59.8
## 4 6 75.4
## 5 8 91.7
## 6 10 108.5
## 7 12 131.3
## 8 14 141.9
## 9 16 164.7
## 10 18 187.7
## 11 20 205.6
## 12 21 214.7
##
## $`3`
## Time weight
## 1 0 40.8
## 2 2 50.4
## 3 4 62.2
## 4 6 77.9
## 5 8 98.4
## 6 10 117.1
## 7 12 144.4
## 8 14 164.5
## 9 16 197.4
## 10 18 233.1
## 11 20 258.9
## 12 21 270.3
##
## $`4`
## Time weight
## 1 0 41.0000
## 2 2 51.8000
## 3 4 64.5000
## 4 6 83.9000
## 5 8 105.6000
## 6 10 126.0000
## 7 12 151.4000
## 8 14 161.8000
## 9 16 182.0000
## 10 18 202.9000
## 11 20 233.8889
## 12 21 238.5556
# including all possible combinations
# drop one Diet/Time cell on purpose to create a gap in the data
xChickWeight <- subset(ChickWeight, !(Diet == 1 & Time == 4))
mxChick <- reshape::melt(xChickWeight, measure.var = "weight")
head(reshape::cast(mxChick, Diet + Time ~ variable, median))## Diet Time weight
## 1 1 0 41
## 2 1 2 49
## 3 1 6 67
## 4 1 8 79
## 5 1 10 93
## 6 1 12 106
# add.missing = TRUE restores the dropped combination with an NA value
head(reshape::cast(mxChick, Diet + Time ~ variable, median, add.missing = TRUE))## Diet Time weight
## 1 1 0 41
## 2 1 2 49
## 3 1 4 NA
## 4 1 6 67
## 5 1 8 79
## 6 1 10 93
# using recast()
# recast() performs melt() and cast() in a single call
head(reshape::recast(xChickWeight, measure.var = "weight", Diet + Time ~ variable, median, add.missing = TRUE))## Diet Time weight
## 1 1 0 41
## 2 1 2 49
## 3 1 4 NA
## 4 1 6 67
## 5 1 8 79
## 6 1 10 93
group_by() is a very useful verb; as the name implies,
it allows you to create groups and then, for example, compute
descriptive statistics by groups. Once your data is grouped, the
operations that will follow will be executed inside each group.
Counting is one of the most common tasks you do when working with
data. Counting may sound simple, but it can get complicated quickly.
Intuitively, one would think that the count function counts the values
of discrete variables: The number of players on a team, the number of
cars, etc. However, count can also be used to calculate the
sum of a variable for a particular group or groups. count
creates a new data frame with the grouping variable and the frequency or
sum variable. This is not always what you want. Sometimes you want to
add counts to your existing data frame.
The argument wt stands for weighted counts. Without the
wt argument, count calculates the frequency of values within a
group (n = n()); with wt, it
calculates the sum of a continuous variable for certain groups
(n = sum(<VARIABLE>)). This technique has
its advantages and disadvantages. On the positive side, we only need
three lines of code instead of six. On the downside, the code is less
explicit, and without knowing the inner workings of count, it’s hard to
tell that the function is calculating sums.
add_tally() does something similar to
add_count(). The only difference is that
add_tally calculates the sum of a given variable instead of
a count. add_tally has no argument for grouping the data.
You must accomplish this with group_by.
# loading the data
data(Gasoline, package = "plm")
gasoline <- as_tibble(Gasoline)
# normalise the country names to lower case
gasoline <- gasoline %>%
mutate(country = tolower(country))
# group_by() attaches grouping metadata; the data itself is unchanged
gasoline %>%
group_by(country)## # A tibble: 342 × 6
## # Groups: country [18]
## country year lgaspcar lincomep lrpmg lcarpcap
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 austria 1960 4.17 -6.47 -0.335 -9.77
## 2 austria 1961 4.10 -6.43 -0.351 -9.61
## 3 austria 1962 4.07 -6.41 -0.380 -9.46
## 4 austria 1963 4.06 -6.37 -0.414 -9.34
## 5 austria 1964 4.04 -6.32 -0.445 -9.24
## 6 austria 1965 4.03 -6.29 -0.497 -9.12
## 7 austria 1966 4.05 -6.25 -0.467 -9.02
## 8 austria 1967 4.05 -6.23 -0.506 -8.93
## 9 austria 1968 4.05 -6.21 -0.522 -8.85
## 10 austria 1969 4.05 -6.15 -0.559 -8.79
## # … with 332 more rows
# grouping by two variables: every country x year pair is its own group
gasoline %>%
group_by(country, year)## # A tibble: 342 × 6
## # Groups: country, year [342]
## country year lgaspcar lincomep lrpmg lcarpcap
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 austria 1960 4.17 -6.47 -0.335 -9.77
## 2 austria 1961 4.10 -6.43 -0.351 -9.61
## 3 austria 1962 4.07 -6.41 -0.380 -9.46
## 4 austria 1963 4.06 -6.37 -0.414 -9.34
## 5 austria 1964 4.04 -6.32 -0.445 -9.24
## 6 austria 1965 4.03 -6.29 -0.497 -9.12
## 7 austria 1966 4.05 -6.25 -0.467 -9.02
## 8 austria 1967 4.05 -6.23 -0.506 -8.93
## 9 austria 1968 4.05 -6.21 -0.522 -8.85
## 10 austria 1969 4.05 -6.15 -0.559 -8.79
## # … with 332 more rows
# ungroup() removes the grouping metadata again
gasoline %>%
group_by(country, year) %>%
ungroup()## # A tibble: 342 × 6
## country year lgaspcar lincomep lrpmg lcarpcap
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 austria 1960 4.17 -6.47 -0.335 -9.77
## 2 austria 1961 4.10 -6.43 -0.351 -9.61
## 3 austria 1962 4.07 -6.41 -0.380 -9.46
## 4 austria 1963 4.06 -6.37 -0.414 -9.34
## 5 austria 1964 4.04 -6.32 -0.445 -9.24
## 6 austria 1965 4.03 -6.29 -0.497 -9.12
## 7 austria 1966 4.05 -6.25 -0.467 -9.02
## 8 austria 1967 4.05 -6.23 -0.506 -8.93
## 9 austria 1968 4.05 -6.21 -0.522 -8.85
## 10 austria 1969 4.05 -6.15 -0.559 -8.79
## # … with 332 more rows
# getting summary statistics
# one row per country with the mean of lgaspcar
gasoline %>%
group_by(country) %>%
dplyr::summarise(mean_gaspcar = mean(lgaspcar))## # A tibble: 18 × 2
## country mean_gaspcar
## <chr> <dbl>
## 1 austria 4.06
## 2 belgium 3.92
## 3 canada 4.86
## 4 denmark 4.19
## 5 france 3.82
## 6 germany 3.89
## 7 greece 4.88
## 8 ireland 4.23
## 9 italy 3.73
## 10 japan 4.70
## 11 netherla 4.08
## 12 norway 4.11
## 13 spain 4.06
## 14 sweden 4.01
## 15 switzerl 4.24
## 16 turkey 5.77
## 17 u.k. 3.98
## 18 u.s.a. 4.82
# summarise first, then keep only the row for france
gasoline %>%
group_by(country) %>%
dplyr::summarise(mean_gaspcar = mean(lgaspcar)) %>%
filter(country == "france")## # A tibble: 1 × 2
## country mean_gaspcar
## <chr> <dbl>
## 1 france 3.82
# per-country descriptive statistics of lgaspcar, kept for reuse below
desc_gasoline <- gasoline %>%
group_by(country) %>%
dplyr::summarise(
mean_gaspcar = mean(lgaspcar),
sd_gaspcar = sd(lgaspcar),
max_gaspcar = max(lgaspcar),
min_gaspcar = min(lgaspcar)
)
# country with the highest mean consumption
desc_gasoline %>%
filter(max(mean_gaspcar) == mean_gaspcar)## # A tibble: 1 × 5
## country mean_gaspcar sd_gaspcar max_gaspcar min_gaspcar
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 turkey 5.77 0.329 6.16 5.14
# country with the lowest mean consumption
desc_gasoline %>%
filter(min(mean_gaspcar) == mean_gaspcar)## # A tibble: 1 × 5
## country mean_gaspcar sd_gaspcar max_gaspcar min_gaspcar
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 italy 3.73 0.220 4.05 3.38
# group_by() and across()
gasoline <- gasoline %>%
mutate(
year = as.character(year),
country = as.character(country)
)
# select the grouping columns by predicate; bare predicates inside across()
# are deprecated since dplyr 1.0.4 and must be wrapped in where()
gasoline %>%
group_by(across(where(is.character))) %>%
dplyr::summarise(mean(lincomep))## # A tibble: 342 × 3
## # Groups: country [18]
## country year `mean(lincomep)`
## <chr> <chr> <dbl>
## 1 austria 1960 -6.47
## 2 austria 1961 -6.43
## 3 austria 1962 -6.41
## 4 austria 1963 -6.37
## 5 austria 1964 -6.32
## 6 austria 1965 -6.29
## 7 austria 1966 -6.25
## 8 austria 1967 -6.23
## 9 austria 1968 -6.21
## 10 austria 1969 -6.15
## # … with 332 more rows
# grouping columns selected by position (columns 1 and 2)
gasoline %>%
group_by(across(c(1, 2))) %>%
dplyr::summarise(mean(lincomep))## # A tibble: 342 × 3
## # Groups: country [18]
## country year `mean(lincomep)`
## <chr> <chr> <dbl>
## 1 austria 1960 -6.47
## 2 austria 1961 -6.43
## 3 austria 1962 -6.41
## 4 austria 1963 -6.37
## 5 austria 1964 -6.32
## 6 austria 1965 -6.29
## 7 austria 1966 -6.25
## 8 austria 1967 -6.23
## 9 austria 1968 -6.21
## 10 austria 1969 -6.15
## # … with 332 more rows
# grouping columns selected by position; the original seq(1:2) was a misuse
# of seq() (it only gives 1 2 via the seq_along() fallback branch) — 1:2 is
# the direct way to spell the first two column positions
gasoline %>%
group_by(across(1:2)) %>%
dplyr::summarise(mean(lincomep))## # A tibble: 342 × 3
## # Groups: country [18]
## country year `mean(lincomep)`
## <chr> <chr> <dbl>
## 1 austria 1960 -6.47
## 2 austria 1961 -6.43
## 3 austria 1962 -6.41
## 4 austria 1963 -6.37
## 5 austria 1964 -6.32
## 6 austria 1965 -6.29
## 7 austria 1966 -6.25
## 8 austria 1967 -6.23
## 9 austria 1968 -6.21
## 10 austria 1969 -6.15
## # … with 332 more rows
# how to count with continuous variables
# bin birth_year into decades via integer division, then count characters per bin
starwars %>%
count(
decade = 10 * (birth_year %/% 10),
name = "characters_per_decade"
) %>%
glimpse()## Rows: 16
## Columns: 2
## $ decade <dbl> 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,…
## $ characters_per_decade <int> 1, 3, 4, 4, 9, 6, 4, 2, 2, 3, 1, 1, 1, 1, 1, 44
# summarise() across many columns
# mean of every column whose name starts with "l", per country
gasoline %>%
group_by(country) %>%
dplyr::summarise(across(starts_with("l"), mean))## # A tibble: 18 × 5
## country lgaspcar lincomep lrpmg lcarpcap
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 austria 4.06 -6.12 -0.486 -8.85
## 2 belgium 3.92 -5.85 -0.326 -8.63
## 3 canada 4.86 -5.58 -1.05 -8.08
## 4 denmark 4.19 -5.76 -0.358 -8.58
## 5 france 3.82 -5.87 -0.253 -8.45
## 6 germany 3.89 -5.85 -0.517 -8.51
## 7 greece 4.88 -6.61 -0.0339 -10.8
## 8 ireland 4.23 -6.44 -0.348 -9.04
## 9 italy 3.73 -6.35 -0.152 -8.83
## 10 japan 4.70 -6.25 -0.287 -9.95
## 11 netherla 4.08 -5.92 -0.370 -8.82
## 12 norway 4.11 -5.75 -0.278 -8.77
## 13 spain 4.06 -5.63 0.739 -9.90
## 14 sweden 4.01 -7.82 -2.71 -8.25
## 15 switzerl 4.24 -5.93 -0.902 -8.54
## 16 turkey 5.77 -7.34 -0.422 -12.5
## 17 u.k. 3.98 -6.02 -0.459 -8.55
## 18 u.s.a. 4.82 -5.45 -1.21 -7.78
# applying several functions to many columns at once
# tibble::lst() auto-names the list, so .names can use the function name {fn}
gasoline %>%
group_by(country) %>%
dplyr::summarise(across(starts_with("l"), tibble::lst(mean, sd, max, min), .names = "{fn}_{col}"))## # A tibble: 18 × 17
## country mean_lgasp…¹ sd_lg…² max_l…³ min_l…⁴ mean_…⁵ sd_li…⁶ max_l…⁷ min_l…⁸
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 austria 4.06 0.0693 4.20 3.92 -6.12 0.235 -5.76 -6.47
## 2 belgium 3.92 0.103 4.16 3.82 -5.85 0.227 -5.53 -6.22
## 3 canada 4.86 0.0262 4.90 4.81 -5.58 0.193 -5.31 -5.89
## 4 denmark 4.19 0.158 4.50 4.00 -5.76 0.176 -5.48 -6.06
## 5 france 3.82 0.0499 3.91 3.75 -5.87 0.241 -5.53 -6.26
## 6 germany 3.89 0.0239 3.93 3.85 -5.85 0.193 -5.56 -6.16
## 7 greece 4.88 0.255 5.38 4.48 -6.61 0.331 -6.15 -7.16
## 8 ireland 4.23 0.0437 4.33 4.16 -6.44 0.162 -6.19 -6.72
## 9 italy 3.73 0.220 4.05 3.38 -6.35 0.217 -6.08 -6.73
## 10 japan 4.70 0.684 6.00 3.95 -6.25 0.425 -5.71 -6.99
## 11 netherla 4.08 0.286 4.65 3.71 -5.92 0.193 -5.66 -6.22
## 12 norway 4.11 0.123 4.44 3.96 -5.75 0.201 -5.42 -6.09
## 13 spain 4.06 0.317 4.75 3.62 -5.63 0.278 -5.29 -6.17
## 14 sweden 4.01 0.0364 4.07 3.91 -7.82 0.126 -7.67 -8.07
## 15 switzerl 4.24 0.102 4.44 4.05 -5.93 0.124 -5.75 -6.16
## 16 turkey 5.77 0.329 6.16 5.14 -7.34 0.331 -6.89 -7.84
## 17 u.k. 3.98 0.0479 4.10 3.91 -6.02 0.107 -5.84 -6.19
## 18 u.s.a. 4.82 0.0219 4.86 4.79 -5.45 0.148 -5.22 -5.70
## # … with 8 more variables: mean_lrpmg <dbl>, sd_lrpmg <dbl>, max_lrpmg <dbl>,
## # min_lrpmg <dbl>, mean_lcarpcap <dbl>, sd_lcarpcap <dbl>,
## # max_lcarpcap <dbl>, min_lcarpcap <dbl>, and abbreviated variable names
## # ¹mean_lgaspcar, ²sd_lgaspcar, ³max_lgaspcar, ⁴min_lgaspcar, ⁵mean_lincomep,
## # ⁶sd_lincomep, ⁷max_lincomep, ⁸min_lincomep
# same four summaries, restricted to columns whose name contains "car"
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(across(dplyr::contains("car"), tibble::lst(mean, sd, max, min), .names = "{fn}_{col}"))## # A tibble: 18 × 9
## country mean_lgasp…¹ sd_lg…² max_l…³ min_l…⁴ mean_…⁵ sd_lc…⁶ max_l…⁷ min_l…⁸
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 austria 4.06 0.0693 4.20 3.92 -8.85 0.473 -8.21 -9.77
## 2 belgium 3.92 0.103 4.16 3.82 -8.63 0.417 -8.10 -9.41
## 3 canada 4.86 0.0262 4.90 4.81 -8.08 0.195 -7.77 -8.38
## 4 denmark 4.19 0.158 4.50 4.00 -8.58 0.349 -8.20 -9.33
## 5 france 3.82 0.0499 3.91 3.75 -8.45 0.344 -8.01 -9.15
## 6 germany 3.89 0.0239 3.93 3.85 -8.51 0.406 -7.95 -9.34
## 7 greece 4.88 0.255 5.38 4.48 -10.8 0.839 -9.57 -12.2
## 8 ireland 4.23 0.0437 4.33 4.16 -9.04 0.345 -8.55 -9.70
## 9 italy 3.73 0.220 4.05 3.38 -8.83 0.639 -8.11 -10.1
## 10 japan 4.70 0.684 6.00 3.95 -9.95 1.20 -8.59 -12.2
## 11 netherla 4.08 0.286 4.65 3.71 -8.82 0.617 -8.16 -10.0
## 12 norway 4.11 0.123 4.44 3.96 -8.77 0.438 -8.17 -9.68
## 13 spain 4.06 0.317 4.75 3.62 -9.90 0.960 -8.63 -11.6
## 14 sweden 4.01 0.0364 4.07 3.91 -8.25 0.242 -7.96 -8.74
## 15 switzerl 4.24 0.102 4.44 4.05 -8.54 0.378 -8.03 -9.26
## 16 turkey 5.77 0.329 6.16 5.14 -12.5 0.751 -11.2 -13.5
## 17 u.k. 3.98 0.0479 4.10 3.91 -8.55 0.281 -8.26 -9.12
## 18 u.s.a. 4.82 0.0219 4.86 4.79 -7.78 0.162 -7.54 -8.02
## # … with abbreviated variable names ¹mean_lgaspcar, ²sd_lgaspcar,
## # ³max_lgaspcar, ⁴min_lgaspcar, ⁵mean_lcarpcap, ⁶sd_lcarpcap, ⁷max_lcarpcap,
## # ⁸min_lcarpcap
# same summaries across every numeric column; a predicate function passed
# to across() must be wrapped in where() — supplying bare `is.numeric` is
# deprecated since dplyr 1.0 and warns. The result is identical.
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(across(where(is.numeric), tibble::lst(mean, sd, min, max), .names = "{fn}_{col}"))## # A tibble: 18 × 17
## country mean_lgasp…¹ sd_lg…² min_l…³ max_l…⁴ mean_…⁵ sd_li…⁶ min_l…⁷ max_l…⁸
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 austria 4.06 0.0693 3.92 4.20 -6.12 0.235 -6.47 -5.76
## 2 belgium 3.92 0.103 3.82 4.16 -5.85 0.227 -6.22 -5.53
## 3 canada 4.86 0.0262 4.81 4.90 -5.58 0.193 -5.89 -5.31
## 4 denmark 4.19 0.158 4.00 4.50 -5.76 0.176 -6.06 -5.48
## 5 france 3.82 0.0499 3.75 3.91 -5.87 0.241 -6.26 -5.53
## 6 germany 3.89 0.0239 3.85 3.93 -5.85 0.193 -6.16 -5.56
## 7 greece 4.88 0.255 4.48 5.38 -6.61 0.331 -7.16 -6.15
## 8 ireland 4.23 0.0437 4.16 4.33 -6.44 0.162 -6.72 -6.19
## 9 italy 3.73 0.220 3.38 4.05 -6.35 0.217 -6.73 -6.08
## 10 japan 4.70 0.684 3.95 6.00 -6.25 0.425 -6.99 -5.71
## 11 netherla 4.08 0.286 3.71 4.65 -5.92 0.193 -6.22 -5.66
## 12 norway 4.11 0.123 3.96 4.44 -5.75 0.201 -6.09 -5.42
## 13 spain 4.06 0.317 3.62 4.75 -5.63 0.278 -6.17 -5.29
## 14 sweden 4.01 0.0364 3.91 4.07 -7.82 0.126 -8.07 -7.67
## 15 switzerl 4.24 0.102 4.05 4.44 -5.93 0.124 -6.16 -5.75
## 16 turkey 5.77 0.329 5.14 6.16 -7.34 0.331 -7.84 -6.89
## 17 u.k. 3.98 0.0479 3.91 4.10 -6.02 0.107 -6.19 -5.84
## 18 u.s.a. 4.82 0.0219 4.79 4.86 -5.45 0.148 -5.70 -5.22
## # … with 8 more variables: mean_lrpmg <dbl>, sd_lrpmg <dbl>, min_lrpmg <dbl>,
## # max_lrpmg <dbl>, mean_lcarpcap <dbl>, sd_lcarpcap <dbl>,
## # min_lcarpcap <dbl>, max_lcarpcap <dbl>, and abbreviated variable names
## # ¹mean_lgaspcar, ²sd_lgaspcar, ³min_lgaspcar, ⁴max_lgaspcar, ⁵mean_lincomep,
## # ⁶sd_lincomep, ⁷min_lincomep, ⁸max_lincomep
# dropping the year column first, then summarising every remaining
# (non-grouping) column with everything()
gasoline %>%
  select(-year) %>%
  group_by(country) %>%
  dplyr::summarise(across(everything(), tibble::lst(mean, sd, min, max), .names = "{fn}_{col}"))## # A tibble: 18 × 17
## country mean_lgasp…¹ sd_lg…² min_l…³ max_l…⁴ mean_…⁵ sd_li…⁶ min_l…⁷ max_l…⁸
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 austria 4.06 0.0693 3.92 4.20 -6.12 0.235 -6.47 -5.76
## 2 belgium 3.92 0.103 3.82 4.16 -5.85 0.227 -6.22 -5.53
## 3 canada 4.86 0.0262 4.81 4.90 -5.58 0.193 -5.89 -5.31
## 4 denmark 4.19 0.158 4.00 4.50 -5.76 0.176 -6.06 -5.48
## 5 france 3.82 0.0499 3.75 3.91 -5.87 0.241 -6.26 -5.53
## 6 germany 3.89 0.0239 3.85 3.93 -5.85 0.193 -6.16 -5.56
## 7 greece 4.88 0.255 4.48 5.38 -6.61 0.331 -7.16 -6.15
## 8 ireland 4.23 0.0437 4.16 4.33 -6.44 0.162 -6.72 -6.19
## 9 italy 3.73 0.220 3.38 4.05 -6.35 0.217 -6.73 -6.08
## 10 japan 4.70 0.684 3.95 6.00 -6.25 0.425 -6.99 -5.71
## 11 netherla 4.08 0.286 3.71 4.65 -5.92 0.193 -6.22 -5.66
## 12 norway 4.11 0.123 3.96 4.44 -5.75 0.201 -6.09 -5.42
## 13 spain 4.06 0.317 3.62 4.75 -5.63 0.278 -6.17 -5.29
## 14 sweden 4.01 0.0364 3.91 4.07 -7.82 0.126 -8.07 -7.67
## 15 switzerl 4.24 0.102 4.05 4.44 -5.93 0.124 -6.16 -5.75
## 16 turkey 5.77 0.329 5.14 6.16 -7.34 0.331 -7.84 -6.89
## 17 u.k. 3.98 0.0479 3.91 4.10 -6.02 0.107 -6.19 -5.84
## 18 u.s.a. 4.82 0.0219 4.79 4.86 -5.45 0.148 -5.70 -5.22
## # … with 8 more variables: mean_lrpmg <dbl>, sd_lrpmg <dbl>, min_lrpmg <dbl>,
## # max_lrpmg <dbl>, mean_lcarpcap <dbl>, sd_lcarpcap <dbl>,
## # min_lcarpcap <dbl>, max_lcarpcap <dbl>, and abbreviated variable names
## # ¹mean_lgaspcar, ²sd_lgaspcar, ³min_lgaspcar, ⁴max_lgaspcar, ⁵mean_lincomep,
## # ⁶sd_lincomep, ⁷min_lincomep, ⁸max_lincomep
# creating bins for continuous variables
# cut_width() slices height into fixed-width 10-unit intervals; count()
# tallies rows per bin (missing heights end up in the <NA> row)
starwars %>%
  count(height_intervals = cut_width(height, 10))## # A tibble: 18 × 2
## height_intervals n
## <fct> <int>
## 1 [65,75] 1
## 2 (75,85] 1
## 3 (85,95] 2
## 4 (95,105] 3
## 5 (105,115] 1
## 6 (115,125] 1
## 7 (135,145] 1
## 8 (145,155] 2
## 9 (155,165] 7
## 10 (165,175] 14
## 11 (175,185] 20
## 12 (185,195] 12
## 13 (195,205] 7
## 14 (205,215] 3
## 15 (215,225] 2
## 16 (225,235] 3
## 17 (255,265] 1
## 18 <NA> 6
# calculating the sum of a variable based on groups
# using group_by() and summarise()
# format(date, "%Y") extracts the year as a character string to group on
economics %>%
  mutate(
    year = format(date, "%Y")) %>%
  group_by(year) %>%
  dplyr::summarise(sum_unemploy = sum(unemploy, na.rm = TRUE))## # A tibble: 49 × 2
## year sum_unemploy
## <chr> <dbl>
## 1 1967 18074
## 2 1968 33569
## 3 1969 33962
## 4 1970 49528
## 5 1971 60260
## 6 1972 58510
## 7 1973 52312
## 8 1974 62080
## 9 1975 95275
## 10 1976 88778
## # … with 39 more rows
# using count()
# wt = unemploy makes count() sum that variable per group instead of
# counting rows — a one-line equivalent of the group_by()/summarise() version
economics %>%
  count(year = format(date, "%Y"), wt = unemploy, name = "sum_unemploy")## # A tibble: 49 × 2
## year sum_unemploy
## <chr> <dbl>
## 1 1967 18074
## 2 1968 33569
## 3 1969 33962
## 4 1970 49528
## 5 1971 60260
## 6 1972 58510
## 7 1973 52312
## 8 1974 62080
## 9 1975 95275
## 10 1976 88778
## # … with 39 more rows
# adding counts as a variable to your data frame with add_count()
# unlike count(), add_count() keeps all rows and appends the group size
# as a new column
mpg %>%
  add_count(manufacturer, name = "number_of_cars_by_manufacturer") %>%
  select(manufacturer, model, number_of_cars_by_manufacturer) %>%
  glimpse()## Rows: 234
## Columns: 3
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi",…
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4…
## $ number_of_cars_by_manufacturer <int> 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,…
# adding a new variable to your data frame that contains the sum of a specific variable
# add_tally(wt = displ) appends the per-group sum of displ while keeping
# every row; the data frame stays grouped by model afterwards
# NOTE(review): "sum_display_per_model" is presumably a typo for
# "sum_displ_per_model" (engine displacement) — kept as-is to match output
mpg %>%
  group_by(model) %>%
  add_tally(wt = displ, name = "sum_display_per_model") %>%
  select(manufacturer, model, sum_display_per_model) %>%
  glimpse()## Rows: 234
## Columns: 3
## Groups: model [38]
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", …
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 qu…
## $ sum_display_per_model <dbl> 16.3, 16.3, 16.3, 16.3, 16.3, 16.3, 16.3, 19.4, …
To count the number of rows within a group, the janitor package’s
tabyl() function makes it really easy. While
tabyl() counted rows by group, janitor’s
adorn_percentages() function will calculate percentages in
a data frame, allowing you to choose whether the denominator for
dividing each item should be a sum by row or col. If you’d like the
results to look like conventional percents – multiplied by a hundred and
rounded with the percent sign included – add the
adorn_pct_formatting() function. However, this will turn
the percents into character strings because of the percent sign, which R
doesn’t recognize as part of a number. So only use that formatting if
you don’t need the data as numbers in your data frame. The syntax for
calculating percents that are non-rounded fractions:
adorn_percentages(mydf, denominator = "col") for
calculating by column. adorn_percentages(mydf) defaults by
row.
# using janitor::tabyl() function to count number of rows within a group
# read every report file in the folder, keep Framingham contributors whose
# address is not a PO box, and de-duplicate on contributor + address before
# tabulating the share of contributions per recipient
# NOTE(review): the output is alphabetical, so `sort = TRUE` appears to have
# no effect here — confirm against the janitor version in use
contributions <- map_df(list.files("input/mayor_finance_reports", full.names = TRUE), rio::import) %>%
  filter(City == "Framingham", !str_detect(tolower(Address), "box")) %>%
  distinct(Contributor, Address, .keep_all = TRUE) %>%
  tabyl(Recipient, sort = TRUE) %>%
  # mutate(percent = round(percent * 100, 1)) %>%
  select(Candidate = Recipient, Pct_Local_Contributors = percent)
contributions## Candidate Pct_Local_Contributors
## Horrigan, Joshua Paul 0.035820896
## Neves-Grigg, Sr., Benjaman 0.011940299
## Sen, Dhruba 0.008955224
## Sousa, Priscila 0.029850746
## Spicer, Dr. Yvonne M. 0.516417910
## Stefanini, John A. 0.337313433
## Tilden, Mark S. 0.059701493
# using adorn_percentages()
# load the official election results, keeping only candidate names and
# their vote totals
results <- readr::read_csv("input/election_framingham_mayor_2017_09.csv", col_names = TRUE) %>%
  dplyr::select(Candidate, Totals)
results## # A tibble: 9 × 2
## Candidate Totals
## <chr> <dbl>
## 1 Blanks 56
## 2 Joshua Paul Horrigan 545
## 3 John A. Stefanini 3184
## 4 Dhruba P. Sen 101
## 5 Mark S. Tilden 439
## 6 Yvonne M. Spicer 5967
## 7 Benjaman A. Neves-Grigg, 134
## 8 Priscila Sousa 538
## 9 Write-Ins 42
# drop the non-candidate rows, then divide each total by the column sum so
# Totals becomes each candidate's share of the (non-blank) vote
results <- results %>%
  filter(!(Candidate %in% c("Blanks", "Write-Ins"))) %>%
  adorn_percentages(denominator = "col") %>%
  rename(Pct_Vote = Totals)
results## Candidate Pct_Vote
## Joshua Paul Horrigan 0.049963330
## John A. Stefanini 0.291895856
## Dhruba P. Sen 0.009259259
## Mark S. Tilden 0.040245691
## Yvonne M. Spicer 0.547029703
## Benjaman A. Neves-Grigg, 0.012284562
## Priscila Sousa 0.049321599
Web scraping process:
It’s a World Wide Web convention that if a site wants to restrict automated bots from “crawling” its pages – either all pages or just some – it posts those details in a robots.txt file in its root directory. So, for a site at www.thesiteurl.com, robots.txt can be found at http://thesiteurl.com/robots.txt (or, in many cases, http://www.thesiteurl.com/robots.txt or https://www.thesiteurl.com/robots.txt).
To be a responsible and considerate Internet citizen, you should make
sure a site hasn’t refused bots and scripts from accessing its pages
before starting to scrape. You can look at these files manually – for
example, checking RStudio’s robots.txt file by going to https://www.rstudio.com/robots.txt in a browser. But it’s
more elegant – and automated – to use the robotstxt package to check for
you. Plus, each time you run a scraper (if it’s one you want to use more
than once), you’ll be sure the site’s robots.txt hasn’t changed to
exclude you. An easy way to do this is using the package
robotstxt and the paths_allowed()
function.
SelectorGadget is a point-and-click tool that lets
you easily figure out what CSS selectors to use in order to extract a
portion of a Web page. CSS makes those more useful by categorizing those
broad HTML elements into subsets, using identifiers hopefully added by
the creator of an HTML page, such as class and id. When you’ve got
exactly the selection you want, look at the text at the bottom right of
the page: That’s your CSS selector. Copy it and paste it into an R
variable. Once you know where your data is, there are several steps to
parsing an HTML page with rvest and extracting data.
rvest’s read_html() function will return a list with
special R object classes of “xml_document” and “xml_node”. To extract
the portion of the page we want, we use CSS selectors and rvest’s
html_nodes() function. The page we read in is the first
argument to html_nodes(); the CSS selector is second.
There’s another rvest function that can extract a
specific attribute: the html_attr() function. For example,
for the ‘href’ attribute, html_attr(‘href’) would work. If instead of
the link attribute, you wanted the text for that link, you would use the
html_text() function.
Finally, we’d like to download the files at each of those links. We
can do that by applying base R’s download.file() function.
However, download.file() requires both a url (which we
have) and a file name (which we don’t have yet), using the syntax
download.file(myurl, myfilename). purrr
can help. It would be easiest, but pretty clunky, to use the URL as the
file name. We’ll be happier later if we extract a file name from the URL
and use that for the name. Fortunately, base R has a function to do just
that: basename(). We want to apply the
basename() function to all the URLs in my_urls and save the
results into an R vector of character strings. purrr’s
map_chr() will do just that.
Ideally, I’d like to use the vector of URLs and the vector of file
names when downloading files, so each URL is downloaded to a file with
the appropriate file name. In other words:
download.file(my_urls[1], my_filenames[1]),
download.file(my_urls[2], my_filenames[2]), and so on. Both
walk() and the map() family have sister
functions designed to do just that: apply a function to two data sets at
a time, one by one. For walk, it’s walk2(). For map, it’s
map2(), map2_df(),map2_chr(), and
so on.
In this case, we want to download the files, but the
download.file() function itself saves the file – there’s no
additional value we want to store. So, a walk() option is
the better choice. walk2() uses the syntax
walk2(myfirstvector, mysecondvector, myfunction) to apply
myfunction like
myfunction(myfirstvector[1], mysecondvector[1]), myfunction(myfirstvector[2], mysecondvector[2]), myfunction(myfirstvector[3], mysecondvector[3]),
and so on. This code applies download.file() to the URLs
and file names one by one in tandem, with the URLs as first argument and
file names as the second argument.
# 1. Checking
# checking robots.txt — TRUE means the site permits bots on this path
paths_allowed("https://www.rstudio.com/resources/cheatsheets/")## [1] TRUE
# 2. Getting a list of all links we want
# selecting CSS (previously identified with SelectorGadget!)
my_css <- "p .btn-primary"
# reading HTML page
my_html <- rvest::read_html("https://www.rstudio.com/resources/cheatsheets/")
# extracting the portion of the page we want using CSS selectors
my_nodes <- rvest::html_nodes(my_html, my_css)
# all the items saved
head(my_nodes)## {xml_nodeset (6)}
## [1] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data- ...
## [2] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data- ...
## [3] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/tidyr ...
## [4] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data- ...
## [5] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/purrr ...
## [6] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/strin ...
# inspecting the second item in the node list
my_nodes[[2]]## {html_node}
## <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf" class="btn btn-primary">
# extracting the href attribute (the download URL) from every matched node
my_urls <- rvest::html_nodes(my_html, my_css) %>%
  rvest::html_attr('href')
head(my_urls)## [1] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-visualization.pdf"
## [2] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf"
## [3] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/tidyr.pdf"
## [4] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-import.pdf"
## [5] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/purrr.pdf"
## [6] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/strings.pdf"
# looking at the second URL in the vector
my_urls[2]## [1] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf"
# extracting the link text with html_text()
my_nodes_text <- rvest::html_nodes(my_html, my_css) %>%
  rvest::html_text()
# looking at the second text element
my_nodes_text[2]## [1] "Download"
# 3. Downloading files
# getting the file names (basename() strips everything before the last "/")
my_filenames <- map_chr(my_urls, basename)
# applying a function to two vectors at a time, element by element:
# walk2(my_urls, my_filenames, download.file)
# equivalent for loop option (use seq_along(x), not seq_along(1:length(x)),
# which is redundant — and the original comment was missing a closing paren):
# for (i in seq_along(my_urls)) {
#   download.file(my_urls[i], my_filenames[i])
# }
# another example
url <- "https://en.wikipedia.org/wiki/List_of_Nobel_laureates"
wiki <- rvest::read_html(url)
# selecting every <table> element on the page
tables <- rvest::html_nodes(wiki, "table")
tables## {xml_nodeset (6)}
## [1] <table class="wikitable sortable"><tbody>\n<tr>\n<th>Year\n</th>\n<th wid ...
## [2] <table class="nowraplinks hlist mw-collapsible mw-collapsed navbox-inner" ...
## [3] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbod ...
## [4] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbod ...
## [5] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbod ...
## [6] <table class="nowraplinks mw-collapsible autocollapse navbox-inner" style ...
# parsing the first table (the laureates list) into a tibble;
# fill = TRUE pads rows that have fewer cells than the header
laureates <- rvest::html_table(tables[[1]], fill = TRUE)
head(laureates)## # A tibble: 6 × 7
## Year Physics Chemi…¹ Physi…² Liter…³ Peace Econo…⁴
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1901 Wilhelm Röntgen Jacobu… Emil A… Sully … Henr… —
## 2 1902 Hendrik Lorentz;Pieter Zeeman Herman… Ronald… Theodo… Élie… —
## 3 1903 Henri Becquerel;Pierre Curie;Mari… Svante… Niels … Bjørns… Rand… —
## 4 1904 Lord Rayleigh Willia… Ivan P… Frédér… Inst… —
## 5 1905 Philipp Lenard Adolf … Robert… Henryk… Bert… —
## 6 1906 J. J. Thomson Henri … Camill… Giosuè… Theo… —
## # … with abbreviated variable names ¹Chemistry, ²`Physiologyor Medicine`,
## # ³Literature, ⁴`Economics(The Sveriges Riksbank Prize)[13][a]`
The code in this RMarkdown is linted with the lintr package, which is based on the tidyverse style guide.
# lintr::lint("main.Rmd", linters =
# lintr::with_defaults(
# commented_code_linter = NULL,
# trailing_whitespace_linter = NULL
# )
# )
# if you have additional scripts and want them to be linted too, add them here
# lintr::lint("scripts/my_script.R")